@@ -55,14 +55,13 @@ def get_draft_token_ids(batch_size: int, k: int, vocab_size: int,
 def get_acceptance_sampler(
     posterior_threshold: float = 0.03,
     posterior_alpha: float = 0.9,
-    disable_bonus_tokens: bool = False,
     strict_mode: bool = False,
 ) -> TypicalAcceptanceSampler:
     """
     Initializes and returns a TypicalAcceptanceSampler.
     """
     return TypicalAcceptanceSampler(posterior_threshold, posterior_alpha,
-                                    disable_bonus_tokens, strict_mode)
+                                    strict_mode)


 @pytest.mark.parametrize("k", list(range(1, 6)))
@@ -154,29 +153,25 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,


 @pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_uniform_target_distribution_accepts_all_tokens(
-        seed: int, disable_bonus_tokens: bool, device: str):
+        seed: int, device: str):
     """
     Test the TypicalAcceptanceSampler with a uniform target probability
     distribution.

     This test verifies that when provided with a uniform target probability
     distribution, the TypicalAcceptanceSampler accepts all draft tokens. The
     entropy of the uniform target distribution being high should lead to all
-    draft tokens being accepted. The test also ensures that the behavior
-    regarding bonus tokens is consistent with the `disable_bonus_tokens`
-    flag.
+    draft tokens being accepted.
     """
     set_random_seed(seed)
     k = 3
     batch_size = 5
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(
-        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
     typical_acceptance_sampler.init_gpu_tensors(device=device)
     target_with_bonus_probs = torch.rand(batch_size,
                                          k + 1,
@@ -200,21 +195,15 @@ def test_uniform_target_distribution_accepts_all_tokens(
     # should lead to all draft tokens being accepted. Verify that.
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
-    if disable_bonus_tokens:
-        assert torch.all(output_token_ids[:, -1] == -1)
-    else:
-        assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze())
+    assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze())

     assert torch.all(output_token_ids[:, :k] == draft_token_ids)


 @pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_temperature_zero_target_distribution(seed: int,
-                                              disable_bonus_tokens: bool,
-                                              device: str):
+def test_temperature_zero_target_distribution(seed: int, device: str):
     """
     Test the TypicalAcceptanceSampler with a zero-temperature target
     probability distribution.
@@ -232,8 +221,7 @@ def test_temperature_zero_target_distribution(seed: int,
     vocab_size = 30_000
     torch.set_default_device(device)

-    typical_acceptance_sampler = get_acceptance_sampler(
-        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
     typical_acceptance_sampler.init_gpu_tensors(device=device)
     # Simulate temperature 0 probability distribution for target probabilities
     # and create target probabilities such that only 1 token id has
@@ -267,11 +255,9 @@ def test_temperature_zero_target_distribution(seed: int,


 @pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool,
-                                   device: str):
+def test_mixed_target_distribution(seed: int, device: str):
     """
     Test the TypicalAcceptanceSampler with a mixed target probability
     distribution.
@@ -285,16 +271,13 @@ def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool,
     with a probability of 1.0 is accepted, and all other tokens are rejected.
     - For sequences with a uniform distribution, all draft tokens are
       accepted.
-    - When `disable_bonus_tokens` is False, the bonus tokens are also accepted
-      for sequences with a uniform distribution.
     """
     set_random_seed(seed)
     k = 3
     batch_size = 4
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(
-        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
     typical_acceptance_sampler.init_gpu_tensors(device=device)
     # For sequences 0 and 2 set the distribution to a temperature
     # zero distribution. For sequences 1 and 3 set it to a uniform
@@ -328,21 +311,16 @@ def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool,
                                                              0]))
     # For sequences 1 and 3 verify that all tokens are accepted since the
     # target probability distribution is uniform. In addition verify that
-    # if disable_bonus_tokens is false then we also accept the bonus tokens.
+    # we also accept the bonus tokens.
     assert torch.all(
         output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :])
-    if disable_bonus_tokens:
-        assert torch.all(output_token_ids[[1, 3], -1] == -1)
-    else:
-        assert torch.all(output_token_ids[[1, 3], -1] != -1)
+    assert torch.all(output_token_ids[[1, 3], -1] != -1)


 @pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
-                                 device: str):
+def test_accept_tokens_partially(seed: int, device: str):
     """
     Test the TypicalAcceptanceSampler's behavior when only a subset of draft
     tokens should be accepted.
@@ -362,8 +340,7 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
     batch_size = 1
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(
-        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
     typical_acceptance_sampler.init_gpu_tensors(device=device)
     # Create a temperature zero target probability distribution and ensure
     # all draft token ids correspond to the tokens with 1.0 probability.
@@ -384,10 +361,7 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
     assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
-    if disable_bonus_tokens:
-        assert torch.all(output_token_ids[:, -1] == -1)
-    else:
-        assert torch.all(output_token_ids[:, -1] == bonus_token_ids)
+    assert torch.all(output_token_ids[:, -1] == bonus_token_ids)
     # Next only keep the first 2 draft tokens same as the zero temperature
     # tokens. For the remaining 3 choose some other tokens. In the
     # response we will expect the first 2 tokens to be the same as the
@@ -408,12 +382,9 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,


 @pytest.mark.parametrize("seed", list(range(1)))
-@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_accept_tokens_set_non_default_posteriors(seed: int,
-                                                  disable_bonus_tokens: bool,
-                                                  device: str):
+def test_accept_tokens_set_non_default_posteriors(seed: int, device: str):
     """
     Test the TypicalAcceptanceSampler with custom posterior thresholds and
     alpha values. This test verifies that by modifying the posterior
@@ -425,8 +396,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
     batch_size = 1
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(
-        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
     typical_acceptance_sampler.init_gpu_tensors(device=device)
     # Simulate temperature 0 probability distribution for target
     # probabilities and create target probabilities such that only 1 token
@@ -457,10 +427,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
     # now accept even draft tokens with very low probability in the
     # target distribution. Simulate and verify the same.
     typical_acceptance_sampler = TypicalAcceptanceSampler(
-        strict_mode=True,
-        disable_bonus_tokens=disable_bonus_tokens,
-        posterior_threshold=0.0,
-        posterior_alpha=0.0)
+        strict_mode=True, posterior_threshold=0.0, posterior_alpha=0.0)
     typical_acceptance_sampler.init_gpu_tensors(device=device)
     output_token_ids = typical_acceptance_sampler(
         target_probs,
@@ -470,18 +437,13 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
     assert output_token_ids.shape[0] == batch_size
     assert output_token_ids.shape[1] == (k + 1)
     assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
-    if disable_bonus_tokens:
-        assert torch.all(output_token_ids[:, -1] == -1)
-    else:
-        assert torch.all(output_token_ids[:, -1] == bonus_token_ids)
+    assert torch.all(output_token_ids[:, -1] == bonus_token_ids)


 @pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool,
-                               device: str):
+def test_replacement_token_ids(seed: int, device: str):
     """
     Test the TypicalAcceptanceSampler's method for generating
     replacement token IDs.
@@ -497,8 +459,7 @@ def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool,
     batch_size = 5
     vocab_size = 30_000
     torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(
-        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
+    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
     typical_acceptance_sampler.init_gpu_tensors(device=device)
     target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
     expected_replacement_tokens = -torch.ones(
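For context on what these tests exercise: typical acceptance keeps a draft token when the probability the target model assigns to it clears a cutoff that shrinks with the entropy of the target distribution, controlled by posterior_threshold and posterior_alpha. The sketch below is a minimal, self-contained illustration of that Medusa-style criterion; the function name accept_draft_tokens and its exact numerics are assumptions for illustration, not vLLM's TypicalAcceptanceSampler internals.

import torch


def accept_draft_tokens(target_probs: torch.Tensor,
                        draft_token_ids: torch.Tensor,
                        posterior_threshold: float = 0.03,
                        posterior_alpha: float = 0.9) -> torch.Tensor:
    """target_probs: [batch, k, vocab_size]; draft_token_ids: [batch, k].

    Returns a [batch, k] boolean mask of accepted draft positions.
    """
    # Probability the target model assigns to each proposed draft token.
    candidate_probs = target_probs.gather(
        -1, draft_token_ids.unsqueeze(-1)).squeeze(-1)
    # Entropy of the target distribution at each position. A high-entropy
    # (e.g. uniform) target drives exp(-entropy) toward zero, lowering the
    # cutoff so nearly every draft token passes, which is what
    # test_uniform_target_distribution_accepts_all_tokens asserts.
    entropy = -torch.sum(target_probs * torch.log(target_probs + 1e-5),
                         dim=-1)
    cutoff = torch.minimum(
        torch.full_like(entropy, posterior_threshold),
        posterior_alpha * torch.exp(-entropy))
    return candidate_probs > cutoff

Under this rule, a temperature-zero target (all mass on one token) has zero entropy, so only that one token clears the cutoff, while setting posterior_threshold and posterior_alpha to 0.0, as test_accept_tokens_set_non_default_posteriors does, drives the cutoff to zero and accepts even very low-probability draft tokens.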