Skip to content

Commit fcb477d

Browse files
authored
[BugFix] Fix ernie tokenizer unittest (#3423)
* fix bert unittest bug
* change token_labels -> sequence_labels
* update ernie tokenizer max_input_size
1 parent c5c06ce commit fcb477d

File tree

1 file changed

+40
-44
lines changed

1 file changed

+40
-44
lines changed

paddlenlp/transformers/ernie/tokenizer.py

Lines changed: 40 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -27,8 +27,46 @@
2727
__all__ = ['ErnieTokenizer', 'ErnieTinyTokenizer']
2828

2929
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
30-
"ernie-doc-base-en": 512,
31-
"ernie-doc-base-zh": 512
30+
"ernie-1.0": 513,
31+
"ernie-1.0-base-zh": 513,
32+
"ernie-1.0-base-zh-cw": 512,
33+
"ernie-1.0-large-zh-cw": 512,
34+
"ernie-tiny": 600,
35+
"ernie-2.0-base-zh": 513,
36+
"ernie-2.0-large-zh": 512,
37+
"ernie-2.0-base-en": 512,
38+
"ernie-2.0-base-en-finetuned-squad": 512,
39+
"ernie-2.0-large-en": 512,
40+
"ernie-gen-base-en": 1024,
41+
"ernie-gen-large-en": 1024,
42+
"ernie-gen-large-en-430g": 1024,
43+
"rocketqa-zh-dureader-query-encoder": 513,
44+
"rocketqa-zh-dureader-para-encoder": 513,
45+
"rocketqa-v1-marco-query-encoder": 512,
46+
"rocketqa-v1-marco-para-encoder": 512,
47+
"rocketqa-zh-dureader-cross-encoder": 513,
48+
"rocketqa-v1-marco-cross-encoder": 512,
49+
"ernie-3.0-base-zh": 2048,
50+
"ernie-3.0-xbase-zh": 2048,
51+
"ernie-3.0-medium-zh": 2048,
52+
"ernie-3.0-mini-zh": 2048,
53+
"ernie-3.0-micro-zh": 2048,
54+
"ernie-3.0-nano-zh": 2048,
55+
"rocketqa-zh-base-query-encoder": 2048,
56+
"rocketqa-zh-base-para-encoder": 2048,
57+
"rocketqa-zh-medium-query-encoder": 2048,
58+
"rocketqa-zh-medium-para-encoder": 2048,
59+
"rocketqa-zh-mini-query-encoder": 2048,
60+
"rocketqa-zh-mini-para-encoder": 2048,
61+
"rocketqa-zh-micro-query-encoder": 2048,
62+
"rocketqa-zh-micro-para-encoder": 2048,
63+
"rocketqa-zh-nano-query-encoder": 2048,
64+
"rocketqa-zh-nano-para-encoder": 2048,
65+
"rocketqa-base-cross-encoder": 2048,
66+
"rocketqa-medium-cross-encoder": 2048,
67+
"rocketqa-mini-cross-encoder": 2048,
68+
"rocketqa-micro-cross-encoder": 2048,
69+
"rocketqa-nano-cross-encoder": 2048
3270
}
3371

3472

@@ -288,48 +326,6 @@ class ErnieTokenizer(PretrainedTokenizer):
288326
"do_lower_case": True
289327
},
290328
}
291-
max_model_input_sizes = {
292-
"ernie-1.0": 513,
293-
"ernie-1.0-base-zh": 513,
294-
"ernie-1.0-base-zh-cw": 512,
295-
"ernie-1.0-large-zh-cw": 512,
296-
"ernie-tiny": 600,
297-
"ernie-2.0-base-zh": 513,
298-
"ernie-2.0-large-zh": 512,
299-
"ernie-2.0-base-en": 512,
300-
"ernie-2.0-base-en-finetuned-squad": 512,
301-
"ernie-2.0-large-en": 512,
302-
"ernie-gen-base-en": 1024,
303-
"ernie-gen-large-en": 1024,
304-
"ernie-gen-large-en-430g": 1024,
305-
"rocketqa-zh-dureader-query-encoder": 513,
306-
"rocketqa-zh-dureader-para-encoder": 513,
307-
"rocketqa-v1-marco-query-encoder": 512,
308-
"rocketqa-v1-marco-para-encoder": 512,
309-
"rocketqa-zh-dureader-cross-encoder": 513,
310-
"rocketqa-v1-marco-cross-encoder": 512,
311-
"ernie-3.0-base-zh": 2048,
312-
"ernie-3.0-xbase-zh": 2048,
313-
"ernie-3.0-medium-zh": 2048,
314-
"ernie-3.0-mini-zh": 2048,
315-
"ernie-3.0-micro-zh": 2048,
316-
"ernie-3.0-nano-zh": 2048,
317-
"rocketqa-zh-base-query-encoder": 2048,
318-
"rocketqa-zh-base-para-encoder": 2048,
319-
"rocketqa-zh-medium-query-encoder": 2048,
320-
"rocketqa-zh-medium-para-encoder": 2048,
321-
"rocketqa-zh-mini-query-encoder": 2048,
322-
"rocketqa-zh-mini-para-encoder": 2048,
323-
"rocketqa-zh-micro-query-encoder": 2048,
324-
"rocketqa-zh-micro-para-encoder": 2048,
325-
"rocketqa-zh-nano-query-encoder": 2048,
326-
"rocketqa-zh-nano-para-encoder": 2048,
327-
"rocketqa-base-cross-encoder": 2048,
328-
"rocketqa-medium-cross-encoder": 2048,
329-
"rocketqa-mini-cross-encoder": 2048,
330-
"rocketqa-micro-cross-encoder": 2048,
331-
"rocketqa-nano-cross-encoder": 2048,
332-
}
333329

334330
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
335331

0 commit comments

Comments
 (0)