Perf improvement 16% by removing offsets. #1587

Merged (9 commits, Aug 8, 2024)
Changes from 7 commits
14 changes: 8 additions & 6 deletions bindings/python/benches/test_tiktoken.py
@@ -60,7 +60,12 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
mergeable_ranks=mergeable_ranks,
special_tokens=special_tokens,
)
enc.encode("warmup")
out = enc.encode("This is a test")

hf_enc = Tokenizer.from_pretrained(model)
out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids

assert out == out2, "sanity check"

start = time.perf_counter_ns()
enc.encode_ordinary_batch(documents, num_threads=num_threads)
@@ -69,11 +74,9 @@
readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
print(f"tiktoken \t{readable_size} / s")

hf_enc = Tokenizer.from_pretrained(model)
hf_enc.encode("warmup")

start = time.perf_counter_ns()
hf_enc.encode_batch(documents)
hf_enc.encode_batch_fast(documents)
end = time.perf_counter_ns()
readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
print(f"huggingface \t{readable_size} / s")
@@ -82,8 +85,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
dataset_xnli = load_dataset(dataset, dataset_config)

# input_lengths = [(10, False), (10_000, False), (10_000, True)] # Example input lengths
input_lengths = [(10_000, False, True), (10_000, False, False)]
input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)]

for num_threads in threads:
for length, fuse, long in input_lengths:
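For context, a minimal standalone sketch of the comparison this benchmark performs could look like the following. The names here are illustrative assumptions: "gpt2" stands in for whichever model the benchmark loads, the document list is synthetic, and it assumes a tokenizers build that already includes encode_batch_fast:

import time
from tokenizers import Tokenizer

hf_enc = Tokenizer.from_pretrained("gpt2")
documents = ["This is a test"] * 10_000
num_bytes = sum(len(d.encode("utf-8")) for d in documents)

for name, fn in [("encode_batch", hf_enc.encode_batch),
                 ("encode_batch_fast", hf_enc.encode_batch_fast)]:
    fn(documents)  # warmup pass, so caches are hot before timing
    start = time.perf_counter_ns()
    fn(documents)
    end = time.perf_counter_ns()
    print(f"{name}\t{num_bytes / (end - start) * 1e9:,.0f} bytes / s")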
36 changes: 36 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -892,6 +892,42 @@ class Tokenizer:
"""
pass

def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given batch of inputs. This method is faster than `encode_batch`
because it doesn't keep track of offsets; they will all be zeros.

Example:
Here are some examples of the inputs that are accepted::

encode_batch_fast([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])

Args:
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each sequence
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
argument:

- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens

Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

"""
pass

@property
def encode_special_tokens(self):
"""
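As a usage sketch (assuming a build that includes this PR; the model name is hypothetical), the fast variant returns the same token ids as encode_batch, but offsets come back zeroed and, per the Rust change further down, token strings are emitted empty as well:

from tokenizers import Tokenizer

tok = Tokenizer.from_pretrained("gpt2")  # hypothetical model choice

fast = tok.encode_batch_fast(["A single sequence"])[0]
slow = tok.encode_batch(["A single sequence"])[0]

assert fast.ids == slow.ids                    # token ids are identical
assert all(o == (0, 0) for o in fast.offsets)  # offsets are not tracked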
61 changes: 61 additions & 0 deletions bindings/python/src/tokenizer.rs
@@ -1055,6 +1055,67 @@ impl PyTokenizer {
})
}

/// Encode the given batch of inputs. This method is faster than `encode_batch`
/// because it doesn't keep track of offsets; they will all be zeros.
///
/// Example:
/// Here are some examples of the inputs that are accepted::
///
/// encode_batch_fast([
/// "A single sequence",
/// ("A tuple with a sequence", "And its pair"),
/// [ "A", "pre", "tokenized", "sequence" ],
/// ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
/// ])
///
/// Args:
/// input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
/// A list of single sequences or pair sequences to encode. Each sequence
/// can be either raw text or pre-tokenized, according to the ``is_pretokenized``
/// argument:
///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// Returns:
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
#[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
fn encode_batch_fast(
&self,
py: Python<'_>,
input: Vec<&PyAny>,
is_pretokenized: bool,
add_special_tokens: bool,
) -> PyResult<Vec<PyEncoding>> {
let input: Vec<tk::EncodeInput> = input
.into_iter()
.map(|o| {
let input: tk::EncodeInput = if is_pretokenized {
o.extract::<PreTokenizedEncodeInput>()?.into()
} else {
o.extract::<TextEncodeInput>()?.into()
};
Ok(input)
})
.collect::<PyResult<Vec<tk::EncodeInput>>>()?;
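// Release the GIL while encoding so other Python threads can make progress.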
py.allow_threads(|| {
ToPyResult(
self.tokenizer
.encode_batch_fast(input, add_special_tokens)
.map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
)
.into()
})
}

/// Decode the given list of ids back to a string
///
/// This is used to decode anything coming back from a Language Model
10 changes: 10 additions & 0 deletions tokenizers/Cargo.toml
@@ -36,6 +36,11 @@ harness = false
name = "unigram_benchmark"
harness = false

[[bench]]
name = "llama3"
required-features = ["http"]
harness = false

[dependencies]
lazy_static = "1.4"
rand = "0.8"
@@ -80,3 +85,8 @@ tracing-subscriber = "0.3.18"

[profile.release]
lto = "fat"

[[example]]
name = "encode_batch"
required-features = ["http"]

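Both the new llama3 benchmark and the encode_batch example fetch a tokenizer from the Hugging Face Hub, which is why they are gated behind the `http` feature above; something like `cargo bench --bench llama3 --features http` or `cargo run --release --example encode_batch --features http` should run them (assuming access to the gated meta-llama repository).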
42 changes: 42 additions & 0 deletions tokenizers/benches/llama3.rs
@@ -0,0 +1,42 @@
#[macro_use]
extern crate criterion;

use criterion::{Criterion, Throughput};
use tokenizers::Tokenizer;

pub fn llama3(c: &mut Criterion) {
let data = std::fs::read_to_string("data/big.txt").unwrap();
let mut group = c.benchmark_group("llama3-encode");
group.throughput(Throughput::Bytes(data.len() as u64));
group.bench_function("llama3-offsets", |b| {
let tokenizer =
Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
b.iter(|| {
tokenizer
.encode_batch_char_offsets(criterion::black_box(data.clone()), add_special_tokens)
.unwrap()
})
});
group.bench_function("llama3-nooffsets", |b| {
let tokenizer =
Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
b.iter(|| {
tokenizer
.encode_batch(criterion::black_box(data.clone()), add_special_tokens)
.unwrap()
})
});
group.finish();
}

criterion_group! {
name = llama3_benches;
config = Criterion::default().sample_size(10);
targets = llama3
}

criterion_main!(llama3_benches);
11 changes: 11 additions & 0 deletions tokenizers/examples/encode_batch.rs
@@ -0,0 +1,11 @@
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;

let data = std::fs::read_to_string("data/big.txt")?;
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
tokenizer.encode_batch_char_offsets(data, add_special_tokens)?;
Ok(())
}
64 changes: 64 additions & 0 deletions tokenizers/src/tokenizer/mod.rs
@@ -759,6 +759,48 @@ where
}
}

/// Encode the given input, skipping offset tracking (offsets in the result will
/// all be zeros). This method accepts both single sequences and pair sequences,
/// and a sequence can be a string or already pre-tokenized input:
///
/// ```
/// # use tokenizers::Tokenizer;
/// # use tokenizers::models::bpe::BPE;
/// # let mut tokenizer = Tokenizer::new(BPE::default());
/// #
/// // Sequences:
/// tokenizer.encode_fast("Single sequence", false);
/// tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
///
/// // Pre-tokenized sequences:
/// tokenizer.encode_fast(&["Single", "sequence"][..], false);
/// tokenizer.encode_fast((
/// &["Sequence", "A"][..],
/// &["Sequence", "B"][..]
/// ), false);
///
/// // or even both types together:
/// tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
/// ```
pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
where
E: Into<EncodeInput<'s>>,
{
// Extract sequences from the EncodeInput
let (sequence, pair) = match input.into() {
EncodeInput::Single(s1) => (s1, None),
EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
};

// Encode each sequence
let encoding = self.encode_single_sequence(sequence, 0, OffsetType::None)?;
let pair_encoding = pair
.map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::None))
.transpose()?;

// And finally post process
self.post_process(encoding, pair_encoding, add_special_tokens)
}

/// Encode the given input. This method accepts both single sequences and pair
/// sequences. Also, a sequence can be a string or already pre-tokenized input:
///
@@ -1059,6 +1101,28 @@ where
Ok(encodings)
}

/// Encode all the sentences in parallel, using multiple threads.
/// Contrary to `encode_batch`, offsets are not tracked and will all be zeros.
pub fn encode_batch_fast<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
where
E: Into<EncodeInput<'s>> + Send,
{
let mut encodings = inputs
.into_maybe_par_iter()
.map(|input| self.encode_fast(input, add_special_tokens))
.collect::<Result<Vec<Encoding>>>()?;

if let Some(params) = &self.padding {
// We do the padding here to make sure we handle the batch padding
pad_encodings(&mut encodings, params)?;
}

Ok(encodings)
}

/// Decode all sentences in parallel
pub fn decode_batch(
&self,
15 changes: 15 additions & 0 deletions tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -8,6 +8,7 @@ use std::collections::HashMap;
pub enum OffsetType {
Byte,
Char,
None,
}

/// Wrapper for a subpart of a `NormalizedString`.
@@ -146,6 +147,19 @@ impl PreTokenizedString {
let offset_converter = match offset_type {
OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
OffsetType::Byte => None,
OffsetType::None => {
let tokens = self
.splits
.into_iter()
.flat_map(|split| {
split.tokens.unwrap().into_iter().map(|token| {
// No offset tracking in this path: emit a zeroed offset, an empty
// token value, and no word index; only the token id is kept.
(token.id, String::new(), (0, 0), None, 0)
})
})
.collect();
return Ok(tokens);
}
};

Ok(self
@@ -197,6 +211,7 @@ impl PreTokenizedString {
let offset_converter = match offset_type {
OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
OffsetType::Byte => None,
OffsetType::None => None,
};

let mut offset = 0;
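Most of the 16% saving presumably comes from this early return: the Byte and Char paths walk every split and convert each token's byte offsets (for Char, after building a BytesToCharOffsetConverter over the original string), while the None path copies only the token ids.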