
Commit 56350a0

Drop the last character in some fraction of batches when training a tokenizer. This addresses an issue where some words in Spanish were being chopped up because the tokenizer had never seen them and interpreted the last character as the end of the document. #1401
1 parent 9a1a50e commit 56350a0

2 files changed: +11 additions, -0 deletions

stanza/models/tokenization/data.py

Lines changed: 10 additions & 0 deletions
@@ -275,6 +275,7 @@ def strings_starting(id_pair, offset=0, pad_len=self.args['max_seqlen']):
             sentences = [copy([x[offset:] for x in self.sentences[pid][sid]])]
 
             drop_sents = False if self.eval or (self.args.get('sent_drop_prob', 0) == 0) else (random.random() < self.args.get('sent_drop_prob', 0))
+            drop_last_char = False if self.eval or (self.args.get('last_char_drop_prob', 0) == 0) else (random.random() < self.args.get('last_char_drop_prob', 0))
             total_len = len(sentences[0][0])
 
             assert self.eval or total_len <= self.args['max_seqlen'], 'The maximum sequence length {} is less than that of the longest sentence length ({}) in the data, consider increasing it! {}'.format(self.args['max_seqlen'], total_len, ' '.join(["{}/{}".format(*x) for x in zip(self.sentences[pid][sid])]))
@@ -311,6 +312,15 @@ def strings_starting(id_pair, offset=0, pad_len=self.args['max_seqlen']):
                 cutoff = self.args['max_seqlen']
                 units, labels, feats, raw_units = units[:cutoff], labels[:cutoff], feats[:cutoff], raw_units[:cutoff]
 
+            if drop_last_char: # can only happen in non-eval mode
+                if len(labels) > 1 and labels[-1] == 2 and labels[-2] in (1, 3):
+                    # training text ended with a sentence end position
+                    # and that word was a single character
+                    # and the previous character ended the word
+                    units, labels, feats, raw_units = units[:-1], labels[:-1], feats[:-1], raw_units[:-1]
+                    # word end -> sentence end, mwt end -> sentence mwt end
+                    labels[-1] = labels[-1] + 1
+
             return units, labels, feats, raw_units
 
         if eval_offsets is not None:
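
For reference, a minimal standalone sketch of the new branch (not the actual Stanza code path), assuming the label scheme implied by the comments in the diff: 1 = word end, 2 = sentence end, 3 = MWT end, 4 = MWT sentence end.

import random

def maybe_drop_last_char(units, labels, feats, raw_units, drop_prob, is_eval):
    # Skip entirely in eval mode or when the probability is 0; otherwise
    # apply the drop with probability drop_prob, mirroring the gating in data.py.
    if is_eval or drop_prob == 0 or random.random() >= drop_prob:
        return units, labels, feats, raw_units
    if len(labels) > 1 and labels[-1] == 2 and labels[-2] in (1, 3):
        # The block ends with a single-character sentence-final token (e.g. a
        # period) and the character before it closes a word or an MWT: drop the
        # final character so the text looks like a document that simply stops.
        units, labels, feats, raw_units = units[:-1], labels[:-1], feats[:-1], raw_units[:-1]
        # word end -> sentence end (1 -> 2), mwt end -> mwt sentence end (3 -> 4)
        labels[-1] = labels[-1] + 1
    return units, labels, feats, raw_units

# Toy example, forcing the drop: "Hola." becomes "Hola", with 'a' relabeled
# as the sentence end in place of the removed '.'
units = list("Hola.")
labels = [0, 0, 0, 1, 2]
units, labels, _, _ = maybe_drop_last_char(units, labels, [None] * 5, list("Hola."), 1.0, False)
print(units, labels)   # ['H', 'o', 'l', 'a'] [0, 0, 0, 2]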

stanza/models/tokenizer.py

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ def build_argparse():
     parser.add_argument('--feat_unit_dropout', type=float, default=0.33, help="The whole feature of units dropout probability")
     parser.add_argument('--tok_noise', type=float, default=0.02, help="Probability to induce noise to the input of the higher RNN")
     parser.add_argument('--sent_drop_prob', type=float, default=0.2, help="Probability to drop sentences at the end of batches during training uniformly at random. Idea is to fake paragraph endings.")
+    parser.add_argument('--last_char_drop_prob', type=float, default=0.2, help="Probability to drop the last char of a block of text during training, uniformly at random. Idea is to fake a document ending w/o sentence final punctuation, hopefully to avoid the tokenizer learning to always tokenize the last character as a period")
     parser.add_argument('--weight_decay', type=float, default=0.0, help="Weight decay")
     parser.add_argument('--max_seqlen', type=int, default=100, help="Maximum sequence length to consider at a time")
     parser.add_argument('--batch_size', type=int, default=32, help="Batch size to use")
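
One note on how the loader picks this up: data.py reads the option with self.args.get('last_char_drop_prob', 0) rather than indexing, so a config saved before this flag existed falls back to 0 and keeps its old behavior; setting the flag to 0 explicitly disables the augmentation as well. A small illustration of that lookup (the plain dicts below stand in for the real args, which carry many more keys):

# Illustrative only: dicts standing in for the tokenizer's args.
old_args = {'sent_drop_prob': 0.2}                             # config from before this change
new_args = {'sent_drop_prob': 0.2, 'last_char_drop_prob': 0.2} # config using the new argparse default

for args in (old_args, new_args):
    # Same lookup data.py uses; a missing key means the augmentation stays off.
    print(args.get('last_char_drop_prob', 0))
# prints 0 for the old config and 0.2 for the new one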
