
Commit d806c2b

[FastTokenizer] Add clip python tokenizer (#3805)
* Add clip fast tokenizer
* fix cpp readme
1 parent: e002b0d

File tree

8 files changed: +164 -3 lines

fast_tokenizer/docs/cpp/README.md

Lines changed: 1 addition & 0 deletions
@@ -63,4 +63,5 @@ fast_tokenizer
 FastTokenizer currently provides the following C++ usage examples.
 
 [ErnieFastTokenizer C++ example](../../examples/ernie/)
+[ClipFastTokenizer C++ example](../../examples/clip/)

fast_tokenizer/examples/ernie-3.0/python/README.md

Whitespace-only changes.

fast_tokenizer/fast_tokenizer/pybind/pretokenizers.cc

Lines changed: 7 additions & 1 deletion
@@ -195,11 +195,17 @@ void BindPreTokenizers(pybind11::module* m) {
           pretokenizer_ptr =
               py_pretokenizer
                   .cast<pretokenizers::ByteLevelPreTokenizer*>();
+        } else if (py::type::of(py_pretokenizer)
+                       .is(py::type::of<
+                           pretokenizers::SplitPreTokenizer>())) {
+          pretokenizer_ptr =
+              py_pretokenizer.cast<pretokenizers::SplitPreTokenizer*>();
         } else {
           throw py::value_error(
               "Type of normalizers should be one of `BertPreTokenizer`,"
               " `MetaSpacePreTokenizer`, `SequencePreTokenizer`,"
-              " `WhitespacePreTokenizer`, `ByteLevelPreTokenizer`");
+              " `WhitespacePreTokenizer`, `ByteLevelPreTokenizer`, "
+              "`SplitPreTokenizer`");
         }
         pretokenizers.push_back(pretokenizer_ptr);
       }
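
With this binding, a `SplitPreTokenizer` can now be passed from Python as one element of a pretokenizer sequence, which is what the new CLIP tokenizer relies on. A minimal sketch of that composition, assuming a `fast_tokenizer` package built from this commit is installed; the regex here is a simplified stand-in for illustration, not the CLIP pattern used below:

    from fast_tokenizer import SplitMode
    from fast_tokenizer.pretokenizers import SplitPreTokenizer, ByteLevelPreTokenizer, SequencePreTokenizer

    # Combine a regex-based splitter with byte-level pretokenization,
    # the same composition ClipFastTokenizer sets up in this commit.
    pretok = SequencePreTokenizer([
        SplitPreTokenizer(r"[\p{L}]+|[\p{N}]+",
                          split_mode=SplitMode.REMOVED,
                          invert=True),
        ByteLevelPreTokenizer(add_prefix_space=False),
    ])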

fast_tokenizer/python/fast_tokenizer/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -550,4 +550,4 @@ def get_thread_num():
 from . import pretokenizers
 from . import postprocessors
 from . import decoders
-from .tokenizers_impl import ErnieFastTokenizer, SentencePieceBPEFastTokenizer
+from .tokenizers_impl import ErnieFastTokenizer, SentencePieceBPEFastTokenizer, ClipFastTokenizer

fast_tokenizer/python/fast_tokenizer/tokenizers_impl/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -15,3 +15,4 @@
 from .base_tokenizer import BaseFastTokenizer
 from .ernie import ErnieFastTokenizer
 from .sentencepiece_bpe import SentencePieceBPEFastTokenizer
+from .clip import ClipFastTokenizer

fast_tokenizer/python/fast_tokenizer/tokenizers_impl/clip.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base_tokenizer import BaseFastTokenizer
+
+from fast_tokenizer.normalizers import NFCNormalizer, ReplaceNormalizer, LowercaseNormalizer, SequenceNormalizer
+from fast_tokenizer.pretokenizers import SplitPreTokenizer, ByteLevelPreTokenizer, SequencePreTokenizer
+from fast_tokenizer.models import BPE
+from fast_tokenizer.postprocessors import RobertaPostProcessor
+from fast_tokenizer import Tokenizer, SplitMode
+
+__all__ = ['ClipFastTokenizer']
+
+
+class ClipFastTokenizer(BaseFastTokenizer):
+
+    def __init__(self,
+                 vocab=None,
+                 merges=None,
+                 max_length=None,
+                 unk_token="<|endoftext|>",
+                 pad_token="<|endoftext|>",
+                 bos_token="<|startoftext|>",
+                 eos_token="<|endoftext|>",
+                 add_prefix_space=False,
+                 continuing_subword_prefix="",
+                 end_of_word_suffix="</w>",
+                 trim_offsets=False):
+        # Init Tokenizer instance using tokenization model
+        tokenizer = Tokenizer(
+            BPE(vocab,
+                merges,
+                unk_token=unk_token,
+                continuing_subword_prefix=continuing_subword_prefix,
+                end_of_word_suffix=end_of_word_suffix,
+                fuse_unk=False))
+
+        # Add special tokens
+        bos_token_id = 0
+        eos_token_id = 1
+        if tokenizer.token_to_id(str(unk_token)) is not None:
+            tokenizer.add_special_tokens([str(unk_token)])
+        if tokenizer.token_to_id(str(pad_token)) is not None:
+            tokenizer.add_special_tokens([str(pad_token)])
+        if tokenizer.token_to_id(str(bos_token)) is not None:
+            bos_token_id = tokenizer.token_to_id(str(bos_token))
+            tokenizer.add_special_tokens([str(bos_token)])
+        if tokenizer.token_to_id(str(eos_token)) is not None:
+            eos_token_id = tokenizer.token_to_id(str(eos_token))
+            tokenizer.add_special_tokens([str(eos_token)])
+
+        # Set the normalizer
+        tokenizer.normalizer = SequenceNormalizer([
+            NFCNormalizer(),
+            ReplaceNormalizer(r"\s+", " "),
+            LowercaseNormalizer()
+        ])
+
+        # Set the pretokenizer
+        tokenizer.pretokenizer = SequencePreTokenizer([
+            SplitPreTokenizer(
+                r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+                split_mode=SplitMode.REMOVED,
+                invert=True),
+            ByteLevelPreTokenizer(add_prefix_space=False)
+        ])
+
+        # Set the postprocessor
+        tokenizer.postprocessor = RobertaPostProcessor(sep=(eos_token,
+                                                            eos_token_id),
+                                                       cls=(bos_token,
+                                                            bos_token_id),
+                                                       trim_offsets=False,
+                                                       add_prefix_space=False)
+
+        parameters = {
+            "model": "BPE",
+            "unk_token": unk_token,
+            "pad_token": pad_token,
+            "bos_token": bos_token,
+            "eos_token": eos_token,
+            "add_prefix_space": add_prefix_space,
+            "max_length": max_length,
+            "continuing_subword_prefix": continuing_subword_prefix,
+            "end_of_word_suffix": end_of_word_suffix,
+            "trim_offsets": trim_offsets
+        }
+        super().__init__(tokenizer, parameters)
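
For reference, a rough usage sketch of the new ClipFastTokenizer, following the unit test added below; the local vocab.json / merges.txt file names and the sample sentence are placeholder assumptions (any CLIP-style BPE vocab and merges pair in this format should work):

    from fast_tokenizer import ClipFastTokenizer, models

    # Load a CLIP BPE vocabulary and merge rules from local files
    # (paths here are placeholders).
    vocab, merges = models.BPE.read_file("vocab.json", "merges.txt")
    tokenizer = ClipFastTokenizer(vocab, merges)

    # Encode a sentence; the result carries both tokens and ids, wrapped
    # with <|startoftext|> / <|endoftext|> by the postprocessor.
    result = tokenizer.encode("a photo of a cat")
    print(result.tokens)
    print(result.ids)

Note that the special-token wrapping comes from the RobertaPostProcessor configured above with the CLIP bos/eos tokens, rather than a CLIP-specific postprocessor.
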
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import os
+import unittest
+
+import fast_tokenizer
+from fast_tokenizer import ClipFastTokenizer, models
+from paddlenlp.utils.downloader import get_path_from_url
+
+
+class TestClipFastTokenizer(unittest.TestCase):
+
+    def setUp(self):
+        vocab_path = os.path.join(os.getcwd(), "vocab.json")
+        merges_path = os.path.join(os.getcwd(), "merges.txt")
+        if not os.path.exists(vocab_path):
+            get_path_from_url(
+                "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/vocab.json",
+                os.getcwd())
+        if not os.path.exists(merges_path):
+            get_path_from_url(
+                "http://bj.bcebos.com/paddlenlp/models/community/openai/clip-vit-large-patch14/merges.txt",
+                os.getcwd())
+        vocab, merges = models.BPE.read_file(vocab_path, merges_path)
+        self.tokenizer = ClipFastTokenizer(vocab, merges)
+        self.expected_ids = [
+            49406, 320, 1342, 272, 272, 335, 273, 273, 274, 16368, 13439, 2971,
+            748, 531, 13610, 323, 1896, 8445, 323, 539, 320, 2368, 49407
+        ]
+        self.expected_tokens = [
+            "<|startoftext|>", "a</w>", "'ll</w>", "1</w>", "1</w>", "p</w>",
+            "2</w>", "2</w>", "3</w>", "rf</w>", "âĺĨ</w>", "ho</w>", "!!</w>",
+            "to</w>", "?'</w>", "d</w>", "'d</w>", "''</w>", "d</w>", "of</w>",
+            "a</w>", "cat</w>", "<|endoftext|>"
+        ]
+        self.input_text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
+
+    def test_encode(self):
+        result = self.tokenizer.encode(self.input_text)
+        self.assertEqual(result.tokens, self.expected_tokens)
+        self.assertEqual(result.ids, self.expected_ids)

fast_tokenizer/run_build_py_lib.sh

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ do
     echo "Compile with $core_num cores"
     cmake .. -DWITH_PYTHON=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
     make -j${core_num}
-    if [[$? == 0]];
+    if [[ $? == 0 ]];
     then
         echo "Successfully compile."
     else
