Perf improvement 16% by removing offsets. #1587

Merged (9 commits, Aug 8, 2024)
Changes from 7 commits
14 changes: 8 additions & 6 deletions bindings/python/benches/test_tiktoken.py
@@ -60,7 +60,12 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
mergeable_ranks=mergeable_ranks,
special_tokens=special_tokens,
)
enc.encode("warmup")
out = enc.encode("This is a test")

hf_enc = Tokenizer.from_pretrained(model)
out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids

assert out == out2, "sanity check"

start = time.perf_counter_ns()
enc.encode_ordinary_batch(documents, num_threads=num_threads)
@@ -69,11 +74,9 @@
readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
print(f"tiktoken \t{readable_size} / s")

hf_enc = Tokenizer.from_pretrained(model)
hf_enc.encode("warmup")

start = time.perf_counter_ns()
hf_enc.encode_batch(documents)
hf_enc.encode_batch_fast(documents)
end = time.perf_counter_ns()
readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
print(f"huggingface \t{readable_size} / s")
@@ -82,8 +85,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
dataset_xnli = load_dataset(dataset, dataset_config)

# input_lengths = [(10, False), (10_000, False), (10_000, True)] # Example input lengths
input_lengths = [(10_000, False, True), (10_000, False, False)]
input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)]

for num_threads in threads:
for length, fuse, long in input_lengths:
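For context, a minimal standalone sketch of the comparison this benchmark performs could look like the following. The names here are illustrative assumptions: "gpt2" stands in for whichever model the benchmark loads, the document list is synthetic, and it assumes a tokenizers build that already includes encode_batch_fast:

import time
from tokenizers import Tokenizer

hf_enc = Tokenizer.from_pretrained("gpt2")
documents = ["This is a test"] * 10_000
num_bytes = sum(len(d.encode("utf-8")) for d in documents)

for name, fn in [("encode_batch", hf_enc.encode_batch),
                 ("encode_batch_fast", hf_enc.encode_batch_fast)]:
    fn(documents)  # warmup pass, so caches are hot before timing
    start = time.perf_counter_ns()
    fn(documents)
    end = time.perf_counter_ns()
    print(f"{name}\t{num_bytes / (end - start) * 1e9:,.0f} bytes / s")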
36 changes: 36 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -892,6 +892,42 @@ class Tokenizer:
"""
pass

def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given batch of inputs. This method is faster than `encode_batch`
because it doesn't keep track of offsets; they will all be zeros.

Example:
Here are some examples of the inputs that are accepted::

encode_batch_fast([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])

Args:
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each sequence
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
argument:

- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens

Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

"""
pass

@property
def encode_special_tokens(self):
"""
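As a usage sketch (assuming a build that includes this PR; the model name is hypothetical), the fast variant returns the same token ids as encode_batch, but offsets come back zeroed and, per the Rust change further down, token strings are emitted empty as well:

from tokenizers import Tokenizer

tok = Tokenizer.from_pretrained("gpt2")  # hypothetical model choice

fast = tok.encode_batch_fast(["A single sequence"])[0]
slow = tok.encode_batch(["A single sequence"])[0]

assert fast.ids == slow.ids                    # token ids are identical
assert all(o == (0, 0) for o in fast.offsets)  # offsets are not tracked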
61 changes: 61 additions & 0 deletions bindings/python/src/tokenizer.rs
@@ -1055,6 +1055,67 @@ impl PyTokenizer {
})
}

/// Encode the given batch of inputs. This method is faster than `encode_batch`
/// because it doesn't keep track of offsets; they will all be zeros.
///
/// Example:
/// Here are some examples of the inputs that are accepted::
///
/// encode_batch_fast([
/// "A single sequence",
/// ("A tuple with a sequence", "And its pair"),
/// [ "A", "pre", "tokenized", "sequence" ],
/// ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
/// ])
///
/// Args:
/// input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
/// A list of single sequences or pair sequences to encode. Each sequence
/// can be either raw text or pre-tokenized, according to the ``is_pretokenized``
/// argument:
///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// Returns:
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
#[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
fn encode_batch_fast(
&self,
py: Python<'_>,
input: Vec<&PyAny>,
is_pretokenized: bool,
add_special_tokens: bool,
) -> PyResult<Vec<PyEncoding>> {
let input: Vec<tk::EncodeInput> = input
.into_iter()
.map(|o| {
let input: tk::EncodeInput = if is_pretokenized {
o.extract::<PreTokenizedEncodeInput>()?.into()
} else {
o.extract::<TextEncodeInput>()?.into()
};
Ok(input)
})
.collect::<PyResult<Vec<tk::EncodeInput>>>()?;
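// Release the GIL while encoding so other Python threads can make progress.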
py.allow_threads(|| {
ToPyResult(
self.tokenizer
.encode_batch_fast(input, add_special_tokens)
.map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
)
.into()
})
}

/// Decode the given list of ids back to a string
///
/// This is used to decode anything coming back from a Language Model
10 changes: 10 additions & 0 deletions tokenizers/Cargo.toml
@@ -36,6 +36,11 @@ harness = false
name = "unigram_benchmark"
harness = false

[[bench]]
name = "llama3"
required-features = ["http"]
harness = false

[dependencies]
lazy_static = "1.4"
rand = "0.8"
@@ -80,3 +85,8 @@ tracing-subscriber = "0.3.18"

[profile.release]
lto = "fat"

[[example]]
name = "encode_batch"
required-features = ["http"]

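Both the new llama3 benchmark and the encode_batch example fetch a tokenizer from the Hugging Face Hub, which is why they are gated behind the `http` feature above; something like `cargo bench --bench llama3 --features http` or `cargo run --release --example encode_batch --features http` should run them (assuming access to the gated meta-llama repository).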
42 changes: 42 additions & 0 deletions tokenizers/benches/llama3.rs
@@ -0,0 +1,42 @@
#[macro_use]
extern crate criterion;

use criterion::{Criterion, Throughput};
use tokenizers::Tokenizer;

pub fn llama3(c: &mut Criterion) {
let data = std::fs::read_to_string("data/big.txt").unwrap();
let mut group = c.benchmark_group("llama3-encode");
group.throughput(Throughput::Bytes(data.len() as u64));
group.bench_function("llama3-offsets", |b| {
let tokenizer =
Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
b.iter(|| {
tokenizer
.encode_batch_char_offsets(criterion::black_box(data.clone()), add_special_tokens)
.unwrap()
})
});
group.bench_function("llama3-nooffsets", |b| {
let tokenizer =
Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
b.iter(|| {
tokenizer
.encode_batch(criterion::black_box(data.clone()), add_special_tokens)
.unwrap()
})
});
group.finish();
}

criterion_group! {
name = llama3_benches;
config = Criterion::default().sample_size(10);
targets = llama3
}

criterion_main!(llama3_benches);
11 changes: 11 additions & 0 deletions tokenizers/examples/encode_batch.rs
@@ -0,0 +1,11 @@
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;

let data = std::fs::read_to_string("data/big.txt")?;
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
tokenizer.encode_batch_char_offsets(data, add_special_tokens)?;
Ok(())
}
64 changes: 64 additions & 0 deletions tokenizers/src/tokenizer/mod.rs
@@ -759,6 +759,48 @@ where
}
}

/// Encode the given input, skipping offset tracking (offsets in the result will
/// all be zeros). This method accepts both single sequences and pair sequences,
/// and a sequence can be a string or already pre-tokenized input:
///
/// ```
/// # use tokenizers::Tokenizer;
/// # use tokenizers::models::bpe::BPE;
/// # let mut tokenizer = Tokenizer::new(BPE::default());
/// #
/// // Sequences:
/// tokenizer.encode_fast("Single sequence", false);
/// tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
///
/// // Pre-tokenized sequences:
/// tokenizer.encode_fast(&["Single", "sequence"][..], false);
/// tokenizer.encode_fast((
/// &["Sequence", "A"][..],
/// &["Sequence", "B"][..]
/// ), false);
///
/// // or even both types together:
/// tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
/// ```
pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
where
E: Into<EncodeInput<'s>>,
{
// Extract sequences from the EncodeInput
let (sequence, pair) = match input.into() {
EncodeInput::Single(s1) => (s1, None),
EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
};

// Encode each sequence
let encoding = self.encode_single_sequence(sequence, 0, OffsetType::None)?;
let pair_encoding = pair
.map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::None))
.transpose()?;

// And finally post process
self.post_process(encoding, pair_encoding, add_special_tokens)
}

/// Encode the given input. This method accepts both single sequences and pair
/// sequences. Also, a sequence can be a string or already pre-tokenized input:
///
@@ -1059,6 +1101,28 @@ where
Ok(encodings)
}

/// Encode all the sentences in parallel, using multiple threads.
/// Contrary to `encode_batch`, offsets are not tracked and will all be zeros.
pub fn encode_batch_fast<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
where
E: Into<EncodeInput<'s>> + Send,
{
let mut encodings = inputs
.into_maybe_par_iter()
.map(|input| self.encode_fast(input, add_special_tokens))
.collect::<Result<Vec<Encoding>>>()?;

if let Some(params) = &self.padding {
// We do the padding here to make sure we handle the batch padding
pad_encodings(&mut encodings, params)?;
}

Ok(encodings)
}

/// Decode all sentences in parallel
pub fn decode_batch(
&self,
15 changes: 15 additions & 0 deletions tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -8,6 +8,7 @@ use std::collections::HashMap;
pub enum OffsetType {
Byte,
Char,
None,
}

/// Wrapper for a subpart of a `NormalizedString`.
@@ -146,6 +147,19 @@ impl PreTokenizedString {
let offset_converter = match offset_type {
OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
OffsetType::Byte => None,
OffsetType::None => {
let tokens = self
.splits
.into_iter()
.flat_map(|split| {
split.tokens.unwrap().into_iter().map(|token| {
// No offset tracking in this path: emit a zeroed offset, an empty
// token value, and no word index; only the token id is kept.
(token.id, String::new(), (0, 0), None, 0)
})
})
.collect();
return Ok(tokens);
}
};

Ok(self
@@ -197,6 +211,7 @@ impl PreTokenizedString {
let offset_converter = match offset_type {
OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
OffsetType::Byte => None,
OffsetType::None => None,
};

let mut offset = 0;
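Most of the 16% saving presumably comes from this early return: the Byte and Char paths walk every split and convert each token's byte offsets (for Char, after building a BytesToCharOffsetConverter over the original string), while the None path copies only the token ids.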