
Commit 05131d5

linjieccc and wj-Mcat authored
Add test_tokenizer.py for ERNIE-Layout (#4353)
* add test_tokenizer
* fix test_tokenizer
* fix unittests
* fix
* fix
* update ernie-layout tokenizer
* trigger ci

---------

Co-authored-by: wj-Mcat <[email protected]>
1 parent 29bf84c commit 05131d5

File tree: 2 files changed, +174 -1 lines changed
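For context, here is a minimal sketch of the tokenizer surface exercised by the new tests. The checkpoint name, API calls, expected tokens, and ids are taken from the test file below; the exact ids depend on the released SentencePiece vocabulary.

from paddlenlp.transformers.ernie_layout.tokenizer import ErnieLayoutTokenizer

# Load the pretrained checkpoint used throughout the new tests.
tokenizer = ErnieLayoutTokenizer.from_pretrained("ernie-layoutx-base-uncased")

# SentencePiece tokenization: word-initial pieces carry the "▁" prefix.
tokens = tokenizer.tokenize("This is a test")   # ["▁This", "▁is", "▁a", "▁test"]
ids = tokenizer.convert_tokens_to_ids(tokens)   # [3293, 83, 10, 3034] per test_full_tokenizer

# Special-token layout checked by test_sequence_builders:
# single sequence -> [CLS] A [SEP]; pair -> [CLS] A [SEP] [SEP] B [SEP]
single = tokenizer.build_inputs_with_special_tokens(ids)
pair = tokenizer.build_inputs_with_special_tokens(ids, ids)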

paddlenlp/transformers/ernie_layout/tokenizer.py

Lines changed: 0 additions & 1 deletion
@@ -104,7 +104,6 @@ def __init__(
        self.tokens_to_ids["[MASK]"] = len(self.sp_model) + self.offset
        self.ids_to_tokens = {v: k for k, v in self.tokens_to_ids.items()}
-
        self.SP_CHAR_MAPPING = {}

        for ch in range(65281, 65375):
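The body of that loop falls outside this hunk. As a hedged sketch only (an assumption based on comparable PaddleNLP tokenizers, not the verbatim source), the table most likely folds full-width ASCII variants (U+FF01 to U+FF5E) back to their half-width forms:

# Assumed loop body, shown for illustration; not part of this commit's diff.
SP_CHAR_MAPPING = {}
for ch in range(65281, 65375):
    # 65281..65374 are full-width ASCII variants; subtracting 65248 yields the ASCII code point.
    SP_CHAR_MAPPING[chr(ch)] = chr(ch - 65248)  # e.g. "Ａ" -> "A", "１" -> "1"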
test_tokenizer.py (new file)

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from paddlenlp.transformers.ernie_layout.tokenizer import ErnieLayoutTokenizer
from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer

from ..test_tokenizer_common import TokenizerTesterMixin


class ErnieLayoutEnglishTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = ErnieLayoutTokenizer
    space_between_special_tokens = True

    def get_tokenizer(self, **kwargs) -> PretrainedTokenizer:
        return ErnieLayoutTokenizer.from_pretrained("ernie-layoutx-base-uncased", **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "This is a test"
        output_text = "This is a test"
        return input_text, output_text

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
        token = "[CLS]"
        token_id = 0

        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)

    def test_full_tokenizer(self):
        tokenizer = self.get_tokenizer()

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁test"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3293, 83, 10, 3034])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens, ["▁I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fals", "é", "."]
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [87, 509, 103122, 23, 483, 13821, 4, 136, 903, 83, 84047, 446, 5])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, tokens)

    def test_clean_text(self):
        tokenizer = self.get_tokenizer()

        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
        self.assertListEqual(
            [tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["▁Test"], ["▁", "\xad"], ["▁test"]]
        )

    def test_sequence_builders(self):
        tokenizer = self.get_tokenizer()

        text = tokenizer.encode("sequence builders", return_token_type_ids=None, add_special_tokens=False)["input_ids"]
        text_2 = tokenizer.encode("multi-sequence build", return_token_type_ids=None, add_special_tokens=False)[
            "input_ids"
        ]

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [
            tokenizer.sep_token_id,
            tokenizer.sep_token_id,
        ] + text_2 + [tokenizer.sep_token_id]

    def test_add_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                vocab_size = len(tokenizer)
                self.assertEqual(tokenizer.add_tokens(""), 0)
                self.assertEqual(tokenizer.add_tokens("testoken"), 1)
                self.assertEqual(tokenizer.add_tokens(["testoken1", "testtoken2"]), 2)
                self.assertEqual(len(tokenizer), vocab_size + 3)

                self.assertEqual(tokenizer.add_special_tokens({}), 0)
                self.assertRaises(
                    AssertionError, tokenizer.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
                )
                self.assertEqual(tokenizer.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
                self.assertEqual(
                    tokenizer.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
                )
                self.assertIn("<testtoken3>", tokenizer.special_tokens_map["additional_special_tokens"])
                self.assertIsInstance(tokenizer.special_tokens_map["additional_special_tokens"], list)
                self.assertGreaterEqual(len(tokenizer.special_tokens_map["additional_special_tokens"]), 2)

                self.assertEqual(len(tokenizer), vocab_size + 6)

    def test_add_tokens_tokenizer(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                tokens = tokenizer.encode(
                    "aaaaa bbbbbb low cccccccccdddddddd l", return_token_type_ids=None, add_special_tokens=False
                )["input_ids"]
                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers(model_max_length=64)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.pad_token is None:
                    self.skipTest("No padding token.")
                else:
                    empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
                    for key, value in empty_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    normal_tokens = tokenizer("This", pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # Should also work with truncation
                    normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # truncation to something which is not a multiple of pad_to_multiple_of raises an error
                    self.assertRaises(
                        ValueError,
                        tokenizer.__call__,
                        "This",
                        padding=True,
                        truncation=True,
                        max_length=12,
                        pad_to_multiple_of=8,
                    )

    def test_token_type_ids(self):
        self.skipTest("Ernie-Layout model doesn't have token_type embedding. so skip this test")
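The new class inherits most of its coverage from TokenizerTesterMixin. Assuming the file lives in the tests package implied by the relative import of test_tokenizer_common (the exact path is not shown in this view), it can be run directly, for example:

python -m pytest tests/transformers/ernie_layout/test_tokenizer.py -v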
