Commit eedc856

Partial sync of codebase
1 parent 5818d56 commit eedc856

4 files changed: +116 −28 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ This is the changelog for the open source version of tiktoken.
 - Update version of `pyo3`
 - Use new Rust edition
 - Fix special token handling in `encode_to_numpy`
+- Better error handling
 - Improvements to private APIs
 
 ## [v0.10.0]

scripts/wheel_download.py

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+import argparse
+import zipfile
+from pathlib import Path
+
+import requests
+
+
+def download_artifacts(token, owner, repo, run_id, output_dir):
+    headers = {"Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json"}
+
+    # Get list of artifacts
+    artifacts_url = f"https://api.github.com/repos/{owner}/{repo}/actions/runs/{run_id}/artifacts"
+    response = requests.get(artifacts_url, headers=headers)
+    response.raise_for_status()
+    artifacts = response.json()["artifacts"]
+
+    if not artifacts:
+        print(f"No artifacts found for run ID: {run_id}")
+        return
+
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Found {len(artifacts)} artifacts")
+    for artifact in artifacts:
+        name = artifact["name"]
+        download_url = artifact["archive_download_url"]
+
+        print(f"Downloading {name}...")
+
+        response = requests.get(download_url, headers=headers, stream=True)
+        response.raise_for_status()
+
+        temp_zip = output_dir / f"{name}.zip"
+        with open(temp_zip, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+        with zipfile.ZipFile(temp_zip, "r") as zip_ref:
+            zip_ref.extractall(output_dir)
+        temp_zip.unlink()
+        print(f"Downloaded and extracted {name}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download artifacts from a GitHub Actions run")
+    parser.add_argument("--token", required=True, help="GitHub Personal Access Token")
+    parser.add_argument("--owner", required=True, help="Repository owner")
+    parser.add_argument("--repo", required=True, help="Repository name")
+    parser.add_argument("--run-id", required=True, help="Workflow run ID")
+    parser.add_argument(
+        "--output-dir", default="artifacts", help="Output directory for downloaded artifacts"
+    )
+
+    args = parser.parse_args()
+
+    download_artifacts(args.token, args.owner, args.repo, args.run_id, args.output_dir)
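
Note: besides the argparse entry point, the helper can be driven directly from Python. A minimal sketch, assuming the script is importable as `wheel_download`; the token, repository coordinates, and run ID below are placeholders, not real values:

    # Hypothetical direct invocation of the new helper; every argument here is
    # a placeholder, and the token must be a real PAT with access to the repo.
    from wheel_download import download_artifacts

    download_artifacts(
        token="<github-pat>",    # GitHub Personal Access Token (placeholder)
        owner="openai",          # repository owner (example value)
        repo="tiktoken",         # repository name (example value)
        run_id="1234567890",     # workflow run whose artifacts to fetch (placeholder)
        output_dir="artifacts",  # extraction directory, matching the CLI default
    )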

src/lib.rs

Lines changed: 33 additions & 7 deletions

@@ -172,6 +172,19 @@ impl std::fmt::Display for DecodeError {
 
 impl std::error::Error for DecodeError {}
 
+#[derive(Debug, Clone)]
+pub struct EncodeError {
+    pub message: String,
+}
+
+impl std::fmt::Display for EncodeError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "Could not encode string: {}", self.message)
+    }
+}
+
+impl std::error::Error for EncodeError {}
+
 const MAX_NUM_THREADS: usize = 128;
 
 #[cfg_attr(feature = "python", pyclass)]

@@ -231,7 +244,11 @@ impl CoreBPE {
         ret
     }
 
-    pub fn encode(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<Rank>, usize) {
+    pub fn encode(
+        &self,
+        text: &str,
+        allowed_special: &HashSet<&str>,
+    ) -> Result<(Vec<Rank>, usize), EncodeError> {
         let special_regex = self._get_tl_special_regex();
         let regex = self._get_tl_regex();
         let mut ret = vec![];

@@ -257,8 +274,17 @@ impl CoreBPE {
             let end = next_special.map_or(text.len(), |m| m.start());
 
             // Okay, here we go, compare this logic to encode_ordinary
-            for mat in regex.find_iter(&text[start..end]) {
-                let piece = mat.unwrap().as_str().as_bytes();
+            for mat_res in regex.find_iter(&text[start..end]) {
+                let mat = match mat_res {
+                    Ok(m) => m,
+                    Err(e) => {
+                        return Err(EncodeError {
+                            message: format!("Regex error while tokenizing: {e}"),
+                        });
+                    }
+                };
+
+                let piece = mat.as_str().as_bytes();
                 if let Some(token) = self.encoder.get(piece) {
                     last_piece_token_len = 1;
                     ret.push(*token);

@@ -284,7 +310,7 @@
 
         // last_piece_token_len is how many tokens came from the last regex split. This is used
        // for determining unstable tokens, since you can't merge across (stable) regex splits
-        (ret, last_piece_token_len)
+        Ok((ret, last_piece_token_len))
     }
 
     fn _increase_last_piece_token_len(

@@ -331,7 +357,7 @@
         text: &str,
         allowed_special: &HashSet<&str>,
     ) -> (Vec<Rank>, HashSet<Vec<Rank>>) {
-        let (tokens, last_piece_token_len) = self.encode(text, allowed_special);
+        let (tokens, last_piece_token_len) = self.encode(text, allowed_special).unwrap();
         if last_piece_token_len == 0 {
             // If last_piece_token_len is zero, the last token was a special token and we have
             // no unstable bytes

@@ -427,7 +453,7 @@
         if unstable_bytes.len() > 1 {
             let last_decoded = bstr::decode_last_utf8(unstable_bytes.as_slice());
             if unstable_bytes.len() - last_decoded.1 > 0
-                && last_decoded.0.map_or(false, |c| c.is_whitespace())
+                && last_decoded.0.is_some_and(|c| c.is_whitespace())
             {
                 let mut reencoded = byte_pair_encode(
                     &unstable_bytes[..unstable_bytes.len() - last_decoded.1],

@@ -517,7 +543,7 @@
 
     pub fn encode_with_special_tokens(&self, text: &str) -> Vec<Rank> {
         let allowed_special = self.special_tokens();
-        self.encode(text, &allowed_special).0
+        self.encode(text, &allowed_special).unwrap().0
     }
 }

src/py.rs

Lines changed: 26 additions & 21 deletions

@@ -1,10 +1,10 @@
 use std::collections::HashSet;
 
 use pyo3::{
-    PyResult, exceptions,
+    IntoPyObjectExt, PyResult, exceptions,
     prelude::*,
     pybacked::PyBackedStr,
-    types::{PyBytes, PyList, PyTuple},
+    types::{PyBytes, PyList},
 };
 use rustc_hash::FxHashMap as HashMap;
 

@@ -37,11 +37,14 @@ impl CoreBPE {
         py: Python,
         text: &str,
         allowed_special: HashSet<PyBackedStr>,
-    ) -> Vec<Rank> {
+    ) -> PyResult<Vec<Rank>> {
         py.allow_threads(|| {
             let allowed_special: HashSet<&str> =
                 allowed_special.iter().map(|s| s.as_ref()).collect();
-            self.encode(text, &allowed_special).0
+            match self.encode(text, &allowed_special) {
+                Ok((tokens, _)) => Ok(tokens),
+                Err(e) => Err(PyErr::new::<exceptions::PyValueError, _>(e.message)),
+            }
         })
     }
 

@@ -50,14 +53,20 @@
         py: Python,
         text: &str,
         allowed_special: HashSet<PyBackedStr>,
-    ) -> Py<PyAny> {
-        let tokens = py.allow_threads(|| {
+    ) -> PyResult<Py<PyAny>> {
+        let tokens_res = py.allow_threads(|| {
             let allowed_special: HashSet<&str> =
                 allowed_special.iter().map(|s| s.as_ref()).collect();
-            self.encode(text, &allowed_special).0
+            self.encode(text, &allowed_special)
         });
+
+        let tokens = match tokens_res {
+            Ok((tokens, _)) => tokens,
+            Err(e) => return Err(PyErr::new::<exceptions::PyValueError, _>(e.message)),
+        };
+
         let buffer = TiktokenBuffer { tokens };
-        buffer.into_py(py)
+        buffer.into_py_any(py)
     }
 
     fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> {

@@ -69,7 +78,8 @@
                 // Unicode space, so we make our best guess at where we would have splits
                 Err(e) => {
                     let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
-                    let (tokens, last_piece_token_len) = self.encode(text, &HashSet::new());
+                    let (tokens, last_piece_token_len) =
+                        self.encode(text, &HashSet::new()).unwrap();
                     let (mut tokens, last_piece_token_len) =
                         self._increase_last_piece_token_len(tokens, last_piece_token_len);
 

@@ -110,19 +120,14 @@
         py: Python,
         text: &str,
         allowed_special: HashSet<PyBackedStr>,
-    ) -> Py<PyTuple> {
-        let (tokens, completions) = py.allow_threads(|| {
+    ) -> PyResult<(Vec<Rank>, Py<PyList>)> {
+        let (tokens, completions): (Vec<Rank>, HashSet<Vec<Rank>>) = py.allow_threads(|| {
             let allowed_special: HashSet<&str> =
                 allowed_special.iter().map(|s| s.as_ref()).collect();
             self._encode_unstable_native(text, &allowed_special)
         });
-        let py_completions = PyList::new_bound(
-            py,
-            completions
-                .iter()
-                .map(|seq| PyList::new_bound(py, &seq[..])),
-        );
-        (tokens, py_completions).into_py(py)
+        let py_completions = PyList::new(py, completions.into_iter())?;
+        Ok((tokens, py_completions.into()))
     }
 
     fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> {

@@ -151,17 +156,17 @@
     #[pyo3(name = "decode_bytes")]
     fn py_decode_bytes(&self, py: Python, tokens: Vec<Rank>) -> Result<Py<PyBytes>, PyErr> {
         match py.allow_threads(|| self.decode_bytes(&tokens)) {
-            Ok(bytes) => Ok(PyBytes::new_bound(py, &bytes).into()),
+            Ok(bytes) => Ok(PyBytes::new(py, &bytes).into()),
             Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}", e))),
         }
     }
 
     fn decode_single_token_bytes(&self, py: Python, token: Rank) -> PyResult<Py<PyBytes>> {
         if let Some(bytes) = self.decoder.get(&token) {
-            return Ok(PyBytes::new_bound(py, bytes).into());
+            return Ok(PyBytes::new(py, bytes).into());
         }
         if let Some(bytes) = self.special_tokens_decoder.get(&token) {
-            return Ok(PyBytes::new_bound(py, bytes).into());
+            return Ok(PyBytes::new(py, bytes).into());
         }
         Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
     }
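
Net effect of the src/py.rs changes: an internal encoding failure now surfaces in Python as a ValueError rather than aborting in Rust. A minimal sketch of the caller-visible behavior, assuming the public tiktoken API; the input string here is a hypothetical stand-in, since a concrete input that trips the regex error path is not shown in this commit:

    # Sketch of the Python-visible error path added by this commit.
    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    text = "..."  # hypothetical input that makes the internal regex engine error
    try:
        tokens = enc.encode(text, allowed_special="all")
    except ValueError as err:
        # The Rust EncodeError is mapped to PyValueError in py_encode
        print(f"encode failed: {err}")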
