
Commit 56350a0

Drop the last character in some fraction of batches when training a tokenizer. This addresses an issue where some words in Spanish were being chopped up because the tokenizer had never seen them and interpreted the last character as the end of the document. #1401
1 parent 9a1a50e commit 56350a0

2 files changed: +11 additions, -0 deletions

stanza/models/tokenization/data.py

Lines changed: 10 additions & 0 deletions
@@ -275,6 +275,7 @@ def strings_starting(id_pair, offset=0, pad_len=self.args['max_seqlen']):
             sentences = [copy([x[offset:] for x in self.sentences[pid][sid]])]
 
             drop_sents = False if self.eval or (self.args.get('sent_drop_prob', 0) == 0) else (random.random() < self.args.get('sent_drop_prob', 0))
+            drop_last_char = False if self.eval or (self.args.get('last_char_drop_prob', 0) == 0) else (random.random() < self.args.get('last_char_drop_prob', 0))
             total_len = len(sentences[0][0])
 
             assert self.eval or total_len <= self.args['max_seqlen'], 'The maximum sequence length {} is less than that of the longest sentence length ({}) in the data, consider increasing it! {}'.format(self.args['max_seqlen'], total_len, ' '.join(["{}/{}".format(*x) for x in zip(self.sentences[pid][sid])]))
@@ -311,6 +312,15 @@ def strings_starting(id_pair, offset=0, pad_len=self.args['max_seqlen']):
                 cutoff = self.args['max_seqlen']
                 units, labels, feats, raw_units = units[:cutoff], labels[:cutoff], feats[:cutoff], raw_units[:cutoff]
 
+            if drop_last_char: # can only happen in non-eval mode
+                if len(labels) > 1 and labels[-1] == 2 and labels[-2] in (1, 3):
+                    # training text ended with a sentence end position
+                    # and that word was a single character
+                    # and the previous character ended the word
+                    units, labels, feats, raw_units = units[:-1], labels[:-1], feats[:-1], raw_units[:-1]
+                    # word end -> sentence end, mwt end -> sentence mwt end
+                    labels[-1] = labels[-1] + 1
+
             return units, labels, feats, raw_units
 
         if eval_offsets is not None:
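
For reference, a minimal standalone sketch of the new branch (not the actual Stanza code path), assuming the label scheme implied by the comments in the diff: 1 = word end, 2 = sentence end, 3 = MWT end, 4 = MWT sentence end.

import random

def maybe_drop_last_char(units, labels, feats, raw_units, drop_prob, is_eval):
    # Skip entirely in eval mode or when the probability is 0; otherwise
    # apply the drop with probability drop_prob, mirroring the gating in data.py.
    if is_eval or drop_prob == 0 or random.random() >= drop_prob:
        return units, labels, feats, raw_units
    if len(labels) > 1 and labels[-1] == 2 and labels[-2] in (1, 3):
        # The block ends with a single-character sentence-final token (e.g. a
        # period) and the character before it closes a word or an MWT: drop the
        # final character so the text looks like a document that simply stops.
        units, labels, feats, raw_units = units[:-1], labels[:-1], feats[:-1], raw_units[:-1]
        # word end -> sentence end (1 -> 2), mwt end -> mwt sentence end (3 -> 4)
        labels[-1] = labels[-1] + 1
    return units, labels, feats, raw_units

# Toy example, forcing the drop: "Hola." becomes "Hola", with 'a' relabeled
# as the sentence end in place of the removed '.'
units = list("Hola.")
labels = [0, 0, 0, 1, 2]
units, labels, _, _ = maybe_drop_last_char(units, labels, [None] * 5, list("Hola."), 1.0, False)
print(units, labels)   # ['H', 'o', 'l', 'a'] [0, 0, 0, 2]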

stanza/models/tokenizer.py

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ def build_argparse():
     parser.add_argument('--feat_unit_dropout', type=float, default=0.33, help="The whole feature of units dropout probability")
     parser.add_argument('--tok_noise', type=float, default=0.02, help="Probability to induce noise to the input of the higher RNN")
     parser.add_argument('--sent_drop_prob', type=float, default=0.2, help="Probability to drop sentences at the end of batches during training uniformly at random. Idea is to fake paragraph endings.")
+    parser.add_argument('--last_char_drop_prob', type=float, default=0.2, help="Probability to drop the last char of a block of text during training, uniformly at random. Idea is to fake a document ending w/o sentence final punctuation, hopefully to avoid the tokenizer learning to always tokenize the last character as a period")
     parser.add_argument('--weight_decay', type=float, default=0.0, help="Weight decay")
     parser.add_argument('--max_seqlen', type=int, default=100, help="Maximum sequence length to consider at a time")
     parser.add_argument('--batch_size', type=int, default=32, help="Batch size to use")
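
One note on how the loader picks this up: data.py reads the option with self.args.get('last_char_drop_prob', 0) rather than indexing, so a config saved before this flag existed falls back to 0 and keeps its old behavior; setting the flag to 0 explicitly disables the augmentation as well. A small illustration of that lookup (the plain dicts below stand in for the real args, which carry many more keys):

# Illustrative only: dicts standing in for the tokenizer's args.
old_args = {'sent_drop_prob': 0.2}                             # config from before this change
new_args = {'sent_drop_prob': 0.2, 'last_char_drop_prob': 0.2} # config using the new argparse default

for args in (old_args, new_args):
    # Same lookup data.py uses; a missing key means the augmentation stays off.
    print(args.get('last_char_drop_prob', 0))
# prints 0 for the old config and 0.2 for the new one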
