
Commit d806c2b

[FastTokenizer] Add clip python tokenizer (#3805)
* Add clip fast tokenizer
* fix cpp readme
1 parent: e002b0d

File tree

8 files changed: +164 -3 lines

fast_tokenizer/docs/cpp/README.md

Lines changed: 1 addition & 0 deletions
@@ -63,4 +63,5 @@ fast_tokenizer
 FastTokenizer currently provides the following C++ usage examples.
 
 [ErnieFastTokenizer C++ example](../../examples/ernie/)
+[ClipFastTokenizer C++ example](../../examples/clip/)

fast_tokenizer/examples/ernie-3.0/python/README.md

Whitespace-only changes.

fast_tokenizer/fast_tokenizer/pybind/pretokenizers.cc

Lines changed: 7 additions & 1 deletion
@@ -195,11 +195,17 @@ void BindPreTokenizers(pybind11::module* m) {
           pretokenizer_ptr =
               py_pretokenizer
                   .cast<pretokenizers::ByteLevelPreTokenizer*>();
+        } else if (py::type::of(py_pretokenizer)
+                       .is(py::type::of<
+                           pretokenizers::SplitPreTokenizer>())) {
+          pretokenizer_ptr =
+              py_pretokenizer.cast<pretokenizers::SplitPreTokenizer*>();
         } else {
           throw py::value_error(
               "Type of normalizers should be one of `BertPreTokenizer`,"
               " `MetaSpacePreTokenizer`, `SequencePreTokenizer`,"
-              " `WhitespacePreTokenizer`, `ByteLevelPreTokenizer`");
+              " `WhitespacePreTokenizer`, `ByteLevelPreTokenizer`, "
+              "`SplitPreTokenizer`");
         }
         pretokenizers.push_back(pretokenizer_ptr);
       }
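
With this binding, a `SplitPreTokenizer` can now be passed from Python as one element of a pretokenizer sequence, which is what the new CLIP tokenizer relies on. A minimal sketch of that composition, assuming a `fast_tokenizer` package built from this commit is installed; the regex here is a simplified stand-in for illustration, not the CLIP pattern used below:

    from fast_tokenizer import SplitMode
    from fast_tokenizer.pretokenizers import SplitPreTokenizer, ByteLevelPreTokenizer, SequencePreTokenizer

    # Combine a regex-based splitter with byte-level pretokenization,
    # the same composition ClipFastTokenizer sets up in this commit.
    pretok = SequencePreTokenizer([
        SplitPreTokenizer(r"[\p{L}]+|[\p{N}]+",
                          split_mode=SplitMode.REMOVED,
                          invert=True),
        ByteLevelPreTokenizer(add_prefix_space=False),
    ])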

fast_tokenizer/python/fast_tokenizer/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -550,4 +550,4 @@ def get_thread_num():
 from . import pretokenizers
 from . import postprocessors
 from . import decoders
-from .tokenizers_impl import ErnieFastTokenizer, SentencePieceBPEFastTokenizer
+from .tokenizers_impl import ErnieFastTokenizer, SentencePieceBPEFastTokenizer, ClipFastTokenizer

fast_tokenizer/python/fast_tokenizer/tokenizers_impl/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -15,3 +15,4 @@
 from .base_tokenizer import BaseFastTokenizer
 from .ernie import ErnieFastTokenizer
 from .sentencepiece_bpe import SentencePieceBPEFastTokenizer
+from .clip import ClipFastTokenizer

fast_tokenizer/python/fast_tokenizer/tokenizers_impl/clip.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base_tokenizer import BaseFastTokenizer
+
+from fast_tokenizer.normalizers import NFCNormalizer, ReplaceNormalizer, LowercaseNormalizer, SequenceNormalizer
+from fast_tokenizer.pretokenizers import SplitPreTokenizer, ByteLevelPreTokenizer, SequencePreTokenizer
+from fast_tokenizer.models import BPE
+from fast_tokenizer.postprocessors import RobertaPostProcessor
+from fast_tokenizer import Tokenizer, SplitMode
+
+__all__ = ['ClipFastTokenizer']
+
+
+class ClipFastTokenizer(BaseFastTokenizer):
+
+    def __init__(self,
+                 vocab=None,
+                 merges=None,
+                 max_length=None,
+                 unk_token="<|endoftext|>",
+                 pad_token="<|endoftext|>",
+                 bos_token="<|startoftext|>",
+                 eos_token="<|endoftext|>",
+                 add_prefix_space=False,
+                 continuing_subword_prefix="",
+                 end_of_word_suffix="</w>",
+                 trim_offsets=False):
+        # Init Tokenizer instance using tokenization model
+        tokenizer = Tokenizer(
+            BPE(vocab,
+                merges,
+                unk_token=unk_token,
+                continuing_subword_prefix=continuing_subword_prefix,
+                end_of_word_suffix=end_of_word_suffix,
+                fuse_unk=False))
+
+        # Add special tokens
+        bos_token_id = 0
+        eos_token_id = 1
+        if tokenizer.token_to_id(str(unk_token)) is not None:
+            tokenizer.add_special_tokens([str(unk_token)])
+        if tokenizer.token_to_id(str(pad_token)) is not None:
+            tokenizer.add_special_tokens([str(pad_token)])
+        if tokenizer.token_to_id(str(bos_token)) is not None:
+            bos_token_id = tokenizer.token_to_id(str(bos_token))
+            tokenizer.add_special_tokens([str(bos_token)])
+        if tokenizer.token_to_id(str(eos_token)) is not None:
+            eos_token_id = tokenizer.token_to_id(str(eos_token))
+            tokenizer.add_special_tokens([str(eos_token)])
+
+        # Set the normalizer
+        tokenizer.normalizer = SequenceNormalizer([
+            NFCNormalizer(),
+            ReplaceNormalizer(r"\s+", " "),
+            LowercaseNormalizer()
+        ])
+
+        # Set the pretokenizer
+        tokenizer.pretokenizer = SequencePreTokenizer([
+            SplitPreTokenizer(
+                r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+                split_mode=SplitMode.REMOVED,
+                invert=True),
+            ByteLevelPreTokenizer(add_prefix_space=False)
+        ])
+
+        # Set the postprocessor
+        tokenizer.postprocessor = RobertaPostProcessor(sep=(eos_token,
+                                                            eos_token_id),
+                                                       cls=(bos_token,
+                                                            bos_token_id),
+                                                       trim_offsets=False,
+                                                       add_prefix_space=False)
+
+        parameters = {
+            "model": "BPE",
+            "unk_token": unk_token,
+            "pad_token": pad_token,
+            "bos_token": bos_token,
+            "eos_token": eos_token,
+            "add_prefix_space": add_prefix_space,
+            "max_length": max_length,
+            "continuing_subword_prefix": continuing_subword_prefix,
+            "end_of_word_suffix": end_of_word_suffix,
+            "trim_offsets": trim_offsets
+        }
+        super().__init__(tokenizer, parameters)
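
For reference, a rough usage sketch of the new ClipFastTokenizer, following the unit test added below; the local vocab.json / merges.txt file names and the sample sentence are placeholder assumptions (any CLIP-style BPE vocab and merges pair in this format should work):

    from fast_tokenizer import ClipFastTokenizer, models

    # Load a CLIP BPE vocabulary and merge rules from local files
    # (paths here are placeholders).
    vocab, merges = models.BPE.read_file("vocab.json", "merges.txt")
    tokenizer = ClipFastTokenizer(vocab, merges)

    # Encode a sentence; the result carries both tokens and ids, wrapped
    # with <|startoftext|> / <|endoftext|> by the postprocessor.
    result = tokenizer.encode("a photo of a cat")
    print(result.tokens)
    print(result.ids)

Note that the special-token wrapping comes from the RobertaPostProcessor configured above with the CLIP bos/eos tokens, rather than a CLIP-specific postprocessor.
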
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import os
+import unittest
+
+import fast_tokenizer
+from fast_tokenizer import ClipFastTokenizer, models
+from paddlenlp.utils.downloader import get_path_from_url
+
+
+class TestClipFastTokenizer(unittest.TestCase):
+
+    def setUp(self):
+        vocab_path = os.path.join(os.getcwd(), "vocab.json")
+        merges_path = os.path.join(os.getcwd(), "merges.txt")
+        if not os.path.exists(vocab_path):
+            get_path_from_url(
+                "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/vocab.json",
+                os.getcwd())
+        if not os.path.exists(merges_path):
+            get_path_from_url(
+                "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/merges.txt",
+                os.getcwd())
+        vocab, merges = models.BPE.read_file(vocab_path, merges_path)
+        self.tokenizer = ClipFastTokenizer(vocab, merges)
+        self.expected_ids = [
+            49406, 320, 1342, 272, 272, 335, 273, 273, 274, 16368, 13439, 2971,
+            748, 531, 13610, 323, 1896, 8445, 323, 539, 320, 2368, 49407
+        ]
+        self.expected_tokens = [
+            "<|startoftext|>", "a</w>", "'ll</w>", "1</w>", "1</w>", "p</w>",
+            "2</w>", "2</w>", "3</w>", "rf</w>", "âĺĨ</w>", "ho</w>", "!!</w>",
+            "to</w>", "?'</w>", "d</w>", "'d</w>", "''</w>", "d</w>", "of</w>",
+            "a</w>", "cat</w>", "<|endoftext|>"
+        ]
+        self.input_text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
+
+    def test_encode(self):
+        result = self.tokenizer.encode(self.input_text)
+        self.assertEqual(result.tokens, self.expected_tokens)
+        self.assertEqual(result.ids, self.expected_ids)

fast_tokenizer/run_build_py_lib.sh

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ do
     echo "Compile with $core_num cores"
     cmake .. -DWITH_PYTHON=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
     make -j${core_num}
-    if [[$? == 0]];
+    if [[ $? == 0 ]];
     then
         echo "Successfully compile."
     else
