Add BERTTokenizer for pre-trained SentencePiece model #669
Changes from 2 commits
@@ -1011,6 +1011,95 @@ def convert_tokens_to_ids(self, tokens):
        return self.vocab.to_indices(tokens)

class BERTSPTokenizer(BERTTokenizer):
    """End-to-end SentencePiece tokenization for BERT models.

    It works best with BERTSentenceTransform().

    Parameters
    ----------
    path : str
        Path to the pre-trained subword tokenization model.
    vocab : gluonnlp.Vocab or None, default None
        Vocabulary for the corpus.
        If vocab is None, the pre-trained subword tokenization model is used for indexing.
    num_best : int, default 0
        A scalar for sampling subwords. If num_best is 0 or 1, no sampling is performed.
        If num_best > 1, sampling is done from the num_best results.
        If num_best < 0, num_best is assumed to be infinite and sampling is done from all
        hypotheses (lattice) using the forward-filtering-and-backward-sampling algorithm.
    alpha : float, default 1.0
        A smoothing parameter; inverse temperature for probability rescaling.
    lower : bool, default True
        Whether to strip accents from the text and convert it to lower case.
        When using a BERT pre-trained model, lower should be set to False for the cased
        model and True for the uncased model.
    max_input_chars_per_word : int, default 200

    Examples
    --------
    >>> url = 'http://repo.mxnet.io/gluon/dataset/vocab/test-0690baed.bpe'
    >>> f = gluon.utils.download(url, overwrite=True)
    -etc-
    >>> sp_tokenizer = BERTSPTokenizer(f, lower=True)
    >>> sentence = 'This is a very awesome, life-changing sentence.'
    >>> sp_tokenizer(sentence)
    ['▁This', '▁is', '▁a', '▁very', '▁awesome', ',', '▁life', '-', 'ch', 'anging', '▁sentence', '.']
    """

    def __init__(self,
                 path,
                 vocab=None,
                 num_best=0,
                 alpha=1.0,
                 lower=True,
                 max_input_chars_per_word=200):
        super(BERTSPTokenizer, self).__init__(vocab, lower,
                                              max_input_chars_per_word)
        self._path = path
        self._num_best = num_best
        self._alpha = alpha
        self.sentencepiece = None

    def _activate_sp(self):
        self.sentencepiece = SentencepieceTokenizer(
            self._path, self._num_best, self._alpha)

    def _tokenize_wordpiece(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses Google's SentencePiece tokenizer model file.
[Review thread attached to this line]
- I am working.
- @eric-haibin-lin How can I upload a test SentencePiece model for a unittest, like test_sentencepiece_tokenizer_subword_regularization?
- @haven-jeon I can help with that.
- I uploaded the file to apache-mxnet/gluon/dataset/vocab/test-682b5d15.bpe
- Thanks, I am working.
        For example:
            input = "unaffable"
            output = ["▁un", "aff", "able"]

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BERTBasicTokenizer`.

        Returns:
            A list of sentencepieced tokens.
        """
        # The SentencePiece processor is a SWIG object that cannot be pickled
        # for multiprocessing, so it is created lazily on first use.
        if self.sentencepiece is None:
            self._activate_sp()
        output_tokens = self.sentencepiece(text)
        return output_tokens
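The lazy `_activate_sp()` call above exists because the SentencePiece processor wraps a SWIG object that cannot be pickled. A rough, self-contained illustration of the same deferred-creation pattern (nothing here is from the PR; an open file handle stands in for the SWIG processor):

```python
import pickle


class LazyHolder(object):
    """Defer creating an unpicklable handle until it is first needed."""

    def __init__(self, path):
        self._path = path
        self._handle = None  # nothing unpicklable is stored yet

    def read_some(self, n):
        if self._handle is None:  # e.g. first call inside a worker process
            self._handle = open(self._path, 'rb')  # stand-in for the SWIG processor
        return self._handle.read(n)


holder = LazyHolder('corpus.bin')
pickle.dumps(holder)  # succeeds because the file handle has not been created yet
```

Data pipelines that pickle the transform to send it to worker processes benefit from the same trick: the unpicklable handle is only created after a worker starts calling the tokenizer.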

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab or sentencepiece model."""
        if self.vocab is None:
            if self.sentencepiece is None:
                self._activate_sp()
            return self.sentencepiece._processor.encode_as_ids(  # pylint: disable=protected-access
                ' '.join(tokens))
        else:
            return self.vocab.to_indices(tokens)
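A short usage sketch of the two indexing paths in `convert_tokens_to_ids`. The model path is a placeholder and the toy vocabulary is built on the fly purely for illustration:

```python
import collections

import gluonnlp
from gluonnlp.data import BERTSPTokenizer  # assumed export location

sp_model = 'test-0690baed.bpe'  # placeholder SentencePiece model file

tok = BERTSPTokenizer(sp_model, vocab=None, lower=True)
tokens = tok('This is a test.')

# vocab is None, so the ids come straight from the SentencePiece model.
sp_ids = tok.convert_tokens_to_ids(tokens)

# With an explicit vocabulary, indexing goes through Vocab.to_indices instead.
toy_vocab = gluonnlp.Vocab(collections.Counter(tokens))
tok_with_vocab = BERTSPTokenizer(sp_model, vocab=toy_vocab, lower=True)
vocab_ids = tok_with_vocab.convert_tokens_to_ids(tokens)
```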


class BERTSentenceTransform(object):
    r"""BERT style data transformation.
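For end-to-end use with `BERTSentenceTransform` (as the class docstring suggests), something along these lines should work. The constructor arguments and return values shown are assumptions based on the class above; check its actual signature before copying:

```python
from gluonnlp.data import BERTSentenceTransform  # assumed export location

# sp_tokenizer: a BERTSPTokenizer constructed as in the sketches above.
# Assumed signature: BERTSentenceTransform(tokenizer, max_seq_length, pad=True, pair=True).
transform = BERTSentenceTransform(sp_tokenizer, max_seq_length=32, pair=False)

# Assumed to return padded token ids, the valid length, and segment ids.
input_ids, valid_length, segment_ids = transform(('This is a very awesome, life-changing sentence.',))
```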