
Commit 5818d56

Partial sync of codebase
1 parent 3591ff1 commit 5818d56

File tree

11 files changed: +90 −55 lines

CHANGELOG.md

Lines changed: 11 additions & 0 deletions

@@ -2,6 +2,17 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.11.0]
+- Support for `GPT-5`
+- Update version of `pyo3`
+- Use new Rust edition
+- Fix special token handling in `encode_to_numpy`
+- Improvements to private APIs
+
+## [v0.10.0]
+- Support for newer models
+- Improvements to private APIs
+
 ## [v0.9.0]
 - Support for `o1` and `o3` models
 - Better error messages when loading invalid vocabulary files
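The `GPT-5` entry implies the model registry now resolves that name. A minimal usage sketch, under the assumption that "gpt-5" maps to an o200k-family encoding in this release:

    import tiktoken

    # Assumption: v0.11.0 maps "gpt-5" to an o200k-family encoding.
    enc = tiktoken.encoding_for_model("gpt-5")
    print(enc.name)              # expected: "o200k_base"
    print(enc.encode("hello"))   # token ids under that encoding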

Cargo.toml

Lines changed: 3 additions & 4 deletions

@@ -1,8 +1,7 @@
 [package]
 name = "tiktoken"
-version = "0.10.0"
-edition = "2021"
-rust-version = "1.57.0"
+version = "0.11.0"
+edition = "2024"
 
 [lib]
 name = "tiktoken"
@@ -15,7 +14,7 @@ python = [
 ]
 
 [dependencies]
-pyo3 = { version = "0.22.2", default-features = false, features = [
+pyo3 = { version = "0.24.1", default-features = false, features = [
     "extension-module",
     "macros",
 ], optional = true }

pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.10.0"
+version = "0.11.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -22,7 +22,7 @@ requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]
 build-frontend = "build"
 build-verbosity = 1
 
-linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal"
+linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
 linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
 macos.before-all = "rustup target add aarch64-apple-darwin x86_64-apple-darwin"
 macos.environment = { MACOSX_DEPLOYMENT_TARGET = "10.12" }

src/lib.rs

Lines changed: 4 additions & 2 deletions

@@ -481,7 +481,9 @@ impl CoreBPE {
 
         assert!(
             encoder.len() == decoder.len(),
-            "Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?"
+            "Encoder and decoder must be of equal length. Encoder length: {}, decoder length: {}.\nMaybe you had duplicate token indices in your encoder?",
+            encoder.len(),
+            decoder.len()
         );
 
         let special_tokens_decoder: HashMap<Rank, Vec<u8>> = special_tokens_encoder
@@ -524,7 +526,7 @@ mod tests {
     use fancy_regex::Regex;
    use rustc_hash::FxHashMap as HashMap;
 
-    use crate::{byte_pair_split, Rank};
+    use crate::{Rank, byte_pair_split};
 
     fn setup_ranks() -> HashMap<Vec<u8>, Rank> {
        HashMap::from_iter([(b"ab".to_vec(), 0), (b"cd".to_vec(), 1)])
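The richer assertion reports both lengths when two byte sequences share a rank, since the rank-to-bytes decoder then collapses entries. A hedged sketch of that failure mode from the Python side; the encoding name is illustrative:

    import tiktoken

    try:
        tiktoken.Encoding(
            name="dup_rank_demo",  # hypothetical name
            pat_str=r"\S+|\s+",
            mergeable_ranks={b"ab": 0, b"cd": 0},  # two tokens, one rank
            special_tokens={},
        )
    except BaseException as e:
        # Construction fails; the message now includes both lengths.
        print(e)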

src/py.rs

Lines changed: 41 additions & 38 deletions

@@ -1,15 +1,14 @@
 use std::collections::HashSet;
 
 use pyo3::{
-    exceptions,
+    PyResult, exceptions,
     prelude::*,
     pybacked::PyBackedStr,
     types::{PyBytes, PyList, PyTuple},
-    PyResult,
 };
 use rustc_hash::FxHashMap as HashMap;
 
-use crate::{byte_pair_encode, CoreBPE, Rank};
+use crate::{CoreBPE, Rank, byte_pair_encode};
 
 #[pymethods]
 impl CoreBPE {
@@ -19,12 +18,8 @@ impl CoreBPE {
         special_tokens_encoder: HashMap<String, Rank>,
         pattern: &str,
     ) -> PyResult<Self> {
-        Self::new_internal(
-            encoder,
-            special_tokens_encoder,
-            pattern,
-        )
-        .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))
+        Self::new_internal(encoder, special_tokens_encoder, pattern)
+            .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))
     }
 
     // ====================
@@ -178,7 +173,7 @@ impl CoreBPE {
     fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
         self.sorted_token_bytes
             .iter()
-            .map(|x| PyBytes::new_bound(py, x).into())
+            .map(|x| PyBytes::new(py, x).into())
             .collect()
     }
 }
@@ -204,39 +199,47 @@ impl TiktokenBuffer {
                 "Object is not writable",
             ));
         }
-
-        (*view).obj = slf.clone().into_any().into_ptr();
-
-        let data = &slf.borrow().tokens;
-        (*view).buf = data.as_ptr() as *mut std::os::raw::c_void;
-        (*view).len = (data.len() * std::mem::size_of::<Rank>()) as isize;
-        (*view).readonly = 1;
-        (*view).itemsize = std::mem::size_of::<Rank>() as isize;
-        (*view).format = if (flags & pyo3::ffi::PyBUF_FORMAT) == pyo3::ffi::PyBUF_FORMAT {
-            let msg = std::ffi::CString::new("I").unwrap();
-            msg.into_raw()
-        } else {
-            std::ptr::null_mut()
-        };
-        (*view).ndim = 1;
-        (*view).shape = if (flags & pyo3::ffi::PyBUF_ND) == pyo3::ffi::PyBUF_ND {
-            &mut (*view).len
-        } else {
-            std::ptr::null_mut()
-        };
-        (*view).strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == pyo3::ffi::PyBUF_STRIDES {
-            &mut (*view).itemsize
-        } else {
-            std::ptr::null_mut()
-        };
-        (*view).suboffsets = std::ptr::null_mut();
-        (*view).internal = std::ptr::null_mut();
+        unsafe {
+            let view_ref = &mut *view;
+            view_ref.obj = slf.clone().into_any().into_ptr();
+
+            let data = &slf.borrow().tokens;
+            view_ref.buf = data.as_ptr() as *mut std::os::raw::c_void;
+            view_ref.len = (data.len() * std::mem::size_of::<Rank>()) as isize;
+            view_ref.readonly = 1;
+            view_ref.itemsize = std::mem::size_of::<Rank>() as isize;
+            view_ref.format = if (flags & pyo3::ffi::PyBUF_FORMAT) == pyo3::ffi::PyBUF_FORMAT {
+                let msg = std::ffi::CString::new("I").unwrap();
+                msg.into_raw()
+            } else {
+                std::ptr::null_mut()
+            };
+            view_ref.ndim = 1;
+            view_ref.shape = if (flags & pyo3::ffi::PyBUF_ND) == pyo3::ffi::PyBUF_ND {
+                &mut view_ref.len
+            } else {
+                std::ptr::null_mut()
+            };
+            view_ref.strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == pyo3::ffi::PyBUF_STRIDES {
+                &mut view_ref.itemsize
+            } else {
+                std::ptr::null_mut()
+            };
+            view_ref.suboffsets = std::ptr::null_mut();
+            view_ref.internal = std::ptr::null_mut();
+        }
 
         Ok(())
     }
 
     unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) {
-        std::mem::drop(std::ffi::CString::from_raw((*view).format));
+        // Note that Py_buffer doesn't have a Drop impl
+        unsafe {
+            let view_ref = &mut *view;
+            if !view_ref.format.is_null() {
+                std::mem::drop(std::ffi::CString::from_raw(view_ref.format));
+            }
+        }
     }
 }
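The explicit `unsafe { ... }` blocks look consistent with the Cargo.toml move to edition 2024, where the body of an `unsafe fn` is no longer implicitly unsafe; the `__releasebuffer__` change also adds a null check before reclaiming the format string. These buffer-protocol hooks are what let NumPy view token output without copying. A small sketch of how that surfaces in Python, assuming `encode_to_numpy` wraps the buffer as in tiktoken/core.py below:

    import numpy as np
    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    arr = enc.encode_to_numpy("hello world")

    # The Rust side exposes a read-only, 1-D buffer of u32 ranks ("I" format),
    # so the resulting array is uint32 and not writable.
    assert arr.dtype == np.uint32
    assert not arr.flags.writeable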

tests/test_encoding.py

Lines changed: 12 additions & 5 deletions

@@ -49,6 +49,13 @@ def test_simple_repeated():
     assert enc.encode("00000000000000000") == [8269, 10535, 830]
 
 
+def test_large_repeated():
+    enc = tiktoken.get_encoding("o200k_base")
+
+    with pytest.raises(ValueError):
+        enc.encode("x" * 1_000_000)
+
+
 def test_simple_regex():
     enc = tiktoken.get_encoding("cl100k_base")
     assert enc.encode("rer") == [38149]
@@ -85,7 +92,7 @@ def test_encode_bytes():
 
 @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
 @hypothesis.given(bytestring=st.binary())
-@hypothesis.settings(deadline=None)
+@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
 def test_hyp_encode_bytes(make_enc: Callable[[], tiktoken.Encoding], bytestring: bytes):
     enc = make_enc()
     assert enc.decode_bytes(enc._encode_bytes(bytestring)) == bytestring
@@ -140,7 +147,7 @@ def test_basic_roundtrip(make_enc):
 
 @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
 @hypothesis.given(text=st.text())
-@hypothesis.settings(deadline=None)
+@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
 def test_hyp_roundtrip(make_enc: Callable[[], tiktoken.Encoding], text):
     enc = make_enc()
 
@@ -246,11 +253,11 @@ def test_batch_encode(make_enc: Callable[[], tiktoken.Encoding]):
 
 @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
 @hypothesis.given(batch=st.lists(st.text()))
-@hypothesis.settings(deadline=None)
+@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
 def test_hyp_batch_roundtrip(make_enc: Callable[[], tiktoken.Encoding], batch):
     enc = make_enc()
 
-    encoded = enc.encode_batch(batch)
-    assert encoded == [enc.encode(t) for t in batch]
+    encoded = enc.encode_batch(batch, allowed_special="all")
+    assert encoded == [enc.encode(t, allowed_special="all") for t in batch]
     decoded = enc.decode_batch(encoded)
     assert decoded == batch
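The batch-roundtrip change matters because Hypothesis can generate strings containing special-token text such as "<|endoftext|>", which encode rejects by default. A sketch of the behavior the updated test relies on:

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    text = "hello <|endoftext|>"

    # By default, special-token text in the input raises ValueError.
    try:
        enc.encode(text)
    except ValueError:
        pass

    # allowed_special="all" encodes it as the actual special token,
    # so the round trip through decode is exact.
    tokens = enc.encode(text, allowed_special="all")
    assert enc.decode(tokens) == text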

tests/test_misc.py

Lines changed: 2 additions & 0 deletions

@@ -17,6 +17,8 @@ def test_encoding_for_model():
     assert enc.name == "cl100k_base"
     enc = tiktoken.encoding_for_model("gpt-4o")
     assert enc.name == "o200k_base"
+    enc = tiktoken.encoding_for_model("gpt-oss-120b")
+    assert enc.name == "o200k_harmony"
 
 
 def test_optional_blobfile_dependency():

tiktoken/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -5,4 +5,4 @@
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
 
-__version__ = "0.10.0"
+__version__ = "0.11.0"

tiktoken/core.py

Lines changed: 2 additions & 2 deletions

@@ -155,7 +155,7 @@ def encode_to_numpy(
 
         import numpy as np
 
-        buffer = self._core_bpe.encode_to_tiktoken_buffer(text, self.special_tokens_set)
+        buffer = self._core_bpe.encode_to_tiktoken_buffer(text, allowed_special)
         return np.frombuffer(buffer, dtype=np.uint32)
 
     def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
@@ -394,7 +394,7 @@ def _encode_only_native_bpe(self, text: str) -> list[int]:
         _unused_pat = regex.compile(self._pat_str)
         ret = []
         for piece in regex.findall(_unused_pat, text):
-            ret.extend(self._core_bpe.encode_single_piece(piece))
+            ret.extend(self._core_bpe.encode_single_piece(piece.encode("utf-8")))
        return ret
 
     def _encode_bytes(self, text: bytes) -> list[int]:
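The first hunk is the "special token handling in encode_to_numpy" fix from the changelog: the method previously passed self.special_tokens_set to the core encoder, silently allowing every special token, and now forwards its allowed_special argument. A hedged sketch, assuming the argument behaves as it does for encode:

    import tiktoken

    enc = tiktoken.get_encoding("o200k_base")

    # With the fix, a disallowed special token should raise, matching encode().
    try:
        enc.encode_to_numpy("<|endoftext|>")
    except ValueError:
        pass

    # Explicitly allowing it encodes the token itself.
    arr = enc.encode_to_numpy("<|endoftext|>", allowed_special={"<|endoftext|>"})
    print(arr)  # expected: array([199999], dtype=uint32) for o200k_base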

tiktoken/load.py

Lines changed: 11 additions & 1 deletion

@@ -16,7 +16,7 @@ def read_file(blobpath: str) -> bytes:
         with blobfile.BlobFile(blobpath, "rb") as f:
             return f.read()
 
-    # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
+    # avoiding blobfile for public files helps avoid auth issues, like MFA prompts.
     import requests
 
     resp = requests.get(blobpath)
@@ -88,6 +88,7 @@ def data_gym_to_mergeable_bpe_ranks(
     encoder_json_file: str,
     vocab_bpe_hash: str | None = None,
     encoder_json_hash: str | None = None,
+    clobber_one_byte_tokens: bool = False,
 ) -> dict[bytes, int]:
     # NB: do not add caching to this function
     rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
@@ -109,7 +110,10 @@ def decode_data_gym(value: str) -> bytes:
         return bytes(data_gym_byte_to_byte[b] for b in value)
 
     # add the single byte tokens
+    # if clobber_one_byte_tokens is True, we'll replace these with ones from the encoder json
     bpe_ranks = {bytes([b]): i for i, b in enumerate(rank_to_intbyte)}
+    del rank_to_intbyte
+
     # add the merged tokens
     n = len(bpe_ranks)
     for first, second in bpe_merges:
@@ -126,6 +130,12 @@ def decode_data_gym(value: str) -> bytes:
     # drop these two special tokens if present, since they're not mergeable bpe tokens
     encoder_json_loaded.pop(b"<|endoftext|>", None)
     encoder_json_loaded.pop(b"<|startoftext|>", None)
+
+    if clobber_one_byte_tokens:
+        for k in encoder_json_loaded:
+            if len(k) == 1:
+                bpe_ranks[k] = encoder_json_loaded[k]
+
     assert bpe_ranks == encoder_json_loaded
 
     return bpe_ranks
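A hedged sketch of the new flag: when the encoder JSON assigns its own ranks to single-byte tokens, clobber_one_byte_tokens=True lets those ranks override the data-gym defaults so the final equality assertion can hold. The file paths below are placeholders:

    from tiktoken.load import data_gym_to_mergeable_bpe_ranks

    ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="vocab.bpe",        # placeholder path or URL
        encoder_json_file="encoder.json",  # placeholder path or URL
        clobber_one_byte_tokens=True,      # prefer encoder.json ranks for 1-byte tokens
    )
    print(len(ranks))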
