20
20
# pylint: disable=consider-iterating-dictionary
21
21
22
22
"""Vocabulary."""
23
- from __future__ import absolute_import
24
- from __future__ import print_function
25
-
26
23
__all__ = ['Vocab' ]
27
24
28
25
import collections
29
26
import json
27
+ import typing
30
28
import uuid
31
29
import warnings
32
30
33
31
from mxnet import nd
34
32
35
- from ..data .utils import DefaultLookupDict , count_tokens
36
33
from .. import _constants as C
37
34
from .. import embedding as emb
35
+ from ..data .utils import DefaultLookupDict , Counter , count_tokens
38
36
39
37
UNK_IDX = 0
40
38
@@ -44,38 +42,38 @@ class Vocab(object):
44
42
45
43
Parameters
46
44
----------
47
- counter : Counter or None, default None
45
+ counter
48
46
Counts text token frequencies in the text data. Its keys will be indexed according to
49
47
frequency thresholds such as `max_size` and `min_freq`. Keys of `counter`,
50
48
`unknown_token`, and values of `reserved_tokens` must be of the same hashable type.
51
49
Examples: str, int, and tuple.
52
- max_size : None or int, default None
50
+ max_size
53
51
The maximum possible number of the most frequent tokens in the keys of `counter` that can be
54
52
indexed. Note that this argument does not count any token from `reserved_tokens`. Suppose
55
53
that there are different keys of `counter` whose frequencies are the same; if indexing all of
56
54
them will exceed this argument value, such keys will be indexed one by one according to
57
55
their __cmp__() order until the frequency threshold is met. If this argument is None or
58
56
larger than its largest possible value restricted by `counter` and `reserved_tokens`, this
59
57
argument has no effect.
60
- min_freq : int, default 1
58
+ min_freq
61
59
The minimum frequency required for a token in the keys of `counter` to be indexed.
62
- unknown_token : hashable object or None, default '<unk>'
60
+ unknown_token
63
61
The representation for any unknown token. If `unknown_token` is not
64
62
`None`, looking up any token that is not part of the vocabulary and
65
63
thus considered unknown will return the index of `unknown_token`. If
66
64
None, looking up an unknown token will result in `KeyError`.
67
- padding_token : hashable object or None, default '<pad>'
65
+ padding_token
68
66
The representation for the special padding token.
69
- bos_token : hashable object or None, default '<bos>'
67
+ bos_token
70
68
The representation for the special beginning-of-sequence token.
71
- eos_token : hashable object or None, default '<eos>'
69
+ eos_token
72
70
The representation for the special end-of-sequence token.
73
- reserved_tokens : list of hashable objects or None, default None
71
+ reserved_tokens
74
72
A list specifying additional tokens to be added to the vocabulary.
75
73
`reserved_tokens` must not contain the value of `unknown_token` or
76
74
duplicate tokens. It must also not contain special tokens specified via
77
75
keyword arguments.
78
- token_to_idx : dict mapping tokens (hashable objects) to int or None, default None
76
+ token_to_idx
79
77
If not `None`, specifies the indices of tokens to be used by the
80
78
vocabulary. Each token in `token_to_idx` must be part of the Vocab
81
79
and each index can only be associated with a single token.
@@ -175,9 +173,14 @@ class Vocab(object):
175
173
176
174
"""
177
175
178
- def __init__ (self , counter = None , max_size = None , min_freq = 1 , unknown_token = C .UNK_TOKEN ,
179
- padding_token = C .PAD_TOKEN , bos_token = C .BOS_TOKEN , eos_token = C .EOS_TOKEN ,
180
- reserved_tokens = None , token_to_idx = None , ** kwargs ):
176
+ def __init__ (self , counter : typing .Optional [Counter ] = None ,
177
+ max_size : typing .Optional [int ] = None , min_freq : int = 1 ,
178
+ unknown_token : typing .Optional [typing .Hashable ] = C .UNK_TOKEN ,
179
+ padding_token : typing .Optional [typing .Hashable ] = C .PAD_TOKEN ,
180
+ bos_token : typing .Optional [typing .Hashable ] = C .BOS_TOKEN ,
181
+ eos_token : typing .Optional [typing .Hashable ] = C .EOS_TOKEN ,
182
+ reserved_tokens : typing .Optional [typing .List [typing .Hashable ]] = None ,
183
+ token_to_idx : typing .Optional [typing .Dict [typing .Hashable , int ]] = None , ** kwargs ):
181
184
182
185
# Sanity checks.
183
186
assert min_freq > 0 , '`min_freq` must be set to a positive value.'
0 commit comments