Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package python-tiktoken for openSUSE:Factory
checked in at 2025-12-11 18:39:05
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-tiktoken (Old)
and /work/SRC/openSUSE:Factory/.python-tiktoken.new.1939 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-tiktoken"
Thu Dec 11 18:39:05 2025 rev:4 rq:1322099 version:0.12.0
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-tiktoken/python-tiktoken.changes 2025-03-05 13:42:50.278375137 +0100
+++ /work/SRC/openSUSE:Factory/.python-tiktoken.new.1939/python-tiktoken.changes 2025-12-11 18:40:51.888271635 +0100
@@ -1,0 +2,15 @@
+Thu Dec 11 04:41:27 UTC 2025 - Steve Kowalik <[email protected]>
+
+- Update to version 0.12.0:
+ * Release 0.12.0
+ * Partial sync of codebase (#451)
+ * Add GPT-5 model support with o200k_base encoding (#440)
+ * chore: update dependencies (#449)
+ * Support the free-threaded build (#443)
+ * bump PyO3 version (#444)
+ * Partial sync of codebase
+ * Partial sync of codebase
+ * Sync codebase
+ * Sync codebase (#389)
+
+-------------------------------------------------------------------
Old:
----
tiktoken-0.9.0.tar.zst
New:
----
tiktoken-0.12.0.obscpio
tiktoken-0.12.0.tar.zst
tiktoken.obsinfo
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-tiktoken.spec ++++++
--- /var/tmp/diff_new_pack.SjBMV5/_old 2025-12-11 18:40:52.756308112 +0100
+++ /var/tmp/diff_new_pack.SjBMV5/_new 2025-12-11 18:40:52.756308112 +0100
@@ -1,7 +1,7 @@
#
# spec file for package python-tiktoken
#
-# Copyright (c) 2025 SUSE LLC
+# Copyright (c) 2025 SUSE LLC and contributors
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
@@ -18,7 +18,7 @@
%{?sle15_python_module_pythons}
Name: python-tiktoken
-Version: 0.9.0
+Version: 0.12.0
Release: 0
Summary: Fast BPE tokeniser for use with OpenAI's models
License: MIT
++++++ _service ++++++
--- /var/tmp/diff_new_pack.SjBMV5/_old 2025-12-11 18:40:52.812310465 +0100
+++ /var/tmp/diff_new_pack.SjBMV5/_new 2025-12-11 18:40:52.816310632 +0100
@@ -3,7 +3,7 @@
<param name="url">https://github.com/openai/tiktoken.git</param>
<param name="versionformat">@PARENT_TAG@</param>
<param name="scm">git</param>
- <param name="revision">0.9.0</param>
+ <param name="revision">0.12.0</param>
<param name="match-tag">*</param>
<param name="versionrewrite-pattern">v(\d+\.\d+\.\d+)</param>
<param name="versionrewrite-replacement">\1</param>
++++++ _servicedata ++++++
--- /var/tmp/diff_new_pack.SjBMV5/_old 2025-12-11 18:40:52.844311809 +0100
+++ /var/tmp/diff_new_pack.SjBMV5/_new 2025-12-11 18:40:52.848311978 +0100
@@ -1,6 +1,6 @@
<servicedata>
<service name="tar_scm">
<param name="url">https://github.com/openai/tiktoken.git</param>
- <param name="changesrevision">e35ab0915e37b919946b70947f1d0854196cb72c</param></service></servicedata>
+ <param name="changesrevision">97e49cbadd500b5cc9dbb51a486f0b42e6701bee</param></service></servicedata>
(No newline at EOF)
++++++ tiktoken-0.9.0.tar.zst -> tiktoken-0.12.0.tar.zst ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/.github/workflows/build_wheels.yml new/tiktoken-0.12.0/.github/workflows/build_wheels.yml
--- old/tiktoken-0.9.0/.github/workflows/build_wheels.yml 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/.github/workflows/build_wheels.yml 2025-10-06 22:12:10.000000000 +0200
@@ -17,14 +17,15 @@
# cibuildwheel builds linux wheels inside a manylinux container
# it also takes care of procuring the correct python version for us
os: [ubuntu-latest, windows-latest, macos-latest]
- python-version: [39, 310, 311, 312, 313]
+ python-version: [39, 310, 311, 312, 313, 313t, 314, 314t]
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
- - uses: pypa/[email protected]
+ - uses: pypa/[email protected]
env:
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
+ CIBW_ENABLE: cpython-freethreading
- uses: actions/upload-artifact@v4
with:
@@ -38,24 +39,25 @@
strategy:
fail-fast: false
matrix:
- os: [ubuntu-22.04-arm]
- python-version: [39, 310, 311, 312, 313]
+ os: [ubuntu-24.04-arm]
+ python-version: [39, 310, 311, 312, 313, 313t, 314, 314t]
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
- name: Build wheels
- uses: pypa/[email protected]
+ uses: pypa/[email protected]
env:
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
CIBW_ARCHS: aarch64
CIBW_BUILD_VERBOSITY: 3
# https://github.com/rust-lang/cargo/issues/10583
CIBW_ENVIRONMENT_LINUX: PATH="$PATH:$HOME/.cargo/bin" CARGO_NET_GIT_FETCH_WITH_CLI=true
+ CIBW_ENABLE: cpython-freethreading
- uses: actions/upload-artifact@v4
with:
- name: cibw-wheelsaarch64-${{ matrix.os }}-${{ strategy.job-index }}
+ name: cibw-wheels-aarch64-${{ matrix.os }}-${{ strategy.job-index }}
path: ./wheelhouse/*.whl
build_sdist:
@@ -63,8 +65,8 @@
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
- - uses: actions/checkout@v4
- - uses: actions/setup-python@v5
+ - uses: actions/checkout@v5
+ - uses: actions/setup-python@v6
name: Install Python
with:
python-version: "3.9"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/CHANGELOG.md new/tiktoken-0.12.0/CHANGELOG.md
--- old/tiktoken-0.9.0/CHANGELOG.md 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/CHANGELOG.md 2025-10-06 22:12:10.000000000 +0200
@@ -2,6 +2,27 @@
This is the changelog for the open source version of tiktoken.
+## [v0.12.0]
+- Build wheels for Python 3.14
+- Build musllinux aarch64 wheels
+- Support for free-threaded Python
+- Update version of `pyo3` and `rustc-hash`
+- Avoid use of `blobfile` for reading local files
+- Recognise `gpt-5` model identifier
+- Minor performance improvement for file reading
+
+## [v0.11.0]
+- Support for `GPT-5`
+- Update version of `pyo3`
+- Use new Rust edition
+- Fix special token handling in `encode_to_numpy`
+- Better error handling
+- Improvements to private APIs
+
+## [v0.10.0]
+- Support for newer models
+- Improvements to private APIs
+
## [v0.9.0]
- Support for `o1` and `o3` models
- Better error messages when loading invalid vocabulary files
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/Cargo.toml new/tiktoken-0.12.0/Cargo.toml
--- old/tiktoken-0.9.0/Cargo.toml 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/Cargo.toml 2025-10-06 22:12:10.000000000 +0200
@@ -1,8 +1,7 @@
[package]
name = "tiktoken"
-version = "0.9.0"
-edition = "2021"
-rust-version = "1.57.0"
+version = "0.12.0"
+edition = "2024"
[lib]
name = "tiktoken"
@@ -15,7 +14,7 @@
]
[dependencies]
-pyo3 = { version = "0.22.2", default-features = false, features = [
+pyo3 = { version = "0.26.0", default-features = false, features = [
"extension-module",
"macros",
], optional = true }
@@ -23,5 +22,5 @@
# tiktoken dependencies
fancy-regex = "0.13.0"
regex = "1.10.3"
-rustc-hash = "1.1.0"
+rustc-hash = "2"
bstr = "1.5.0"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/pyproject.toml new/tiktoken-0.12.0/pyproject.toml
--- old/tiktoken-0.9.0/pyproject.toml 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/pyproject.toml 2025-10-06 22:12:10.000000000 +0200
@@ -1,6 +1,6 @@
[project]
name = "tiktoken"
-version = "0.9.0"
+version = "0.12.0"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = { file = "LICENSE" }
@@ -22,7 +22,7 @@
build-frontend = "build"
build-verbosity = 1
-linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal"
+linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
macos.before-all = "rustup target add aarch64-apple-darwin x86_64-apple-darwin"
macos.environment = { MACOSX_DEPLOYMENT_TARGET = "10.12" }
@@ -31,7 +31,6 @@
"*-manylinux_i686",
"*-musllinux_i686",
"*-win32",
- "*-musllinux_aarch64",
]
macos.archs = ["x86_64", "arm64"]
# When cross-compiling on Intel, it is not possible to test arm64 wheels.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/scripts/wheel_download.py new/tiktoken-0.12.0/scripts/wheel_download.py
--- old/tiktoken-0.9.0/scripts/wheel_download.py 1970-01-01 01:00:00.000000000 +0100
+++ new/tiktoken-0.12.0/scripts/wheel_download.py 2025-10-06 22:12:10.000000000 +0200
@@ -0,0 +1,56 @@
+import argparse
+import zipfile
+from pathlib import Path
+
+import requests
+
+
+def download_artifacts(token, owner, repo, run_id, output_dir):
+ headers = {"Authorization": f"token {token}", "Accept":
"application/vnd.github.v3+json"}
+
+ # Get list of artifacts
+ artifacts_url = f"https://api.github.com/repos/{owner}/{repo}/actions/runs/{run_id}/artifacts"
+ response = requests.get(artifacts_url, headers=headers)
+ response.raise_for_status()
+ artifacts = response.json()["artifacts"]
+
+ if not artifacts:
+ print(f"No artifacts found for run ID: {run_id}")
+ return
+
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ print(f"Found {len(artifacts)} artifacts")
+ for artifact in artifacts:
+ name = artifact["name"]
+ download_url = artifact["archive_download_url"]
+
+ print(f"Downloading {name}...")
+
+ response = requests.get(download_url, headers=headers, stream=True)
+ response.raise_for_status()
+
+ temp_zip = output_dir / f"{name}.zip"
+ with open(temp_zip, "wb") as f:
+ for chunk in response.iter_content(chunk_size=8192):
+ f.write(chunk)
+ with zipfile.ZipFile(temp_zip, "r") as zip_ref:
+ zip_ref.extractall(output_dir)
+ temp_zip.unlink()
+ print(f"Downloaded and extracted {name}")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Download artifacts from a GitHub Actions run")
+ parser.add_argument("--token", required=True, help="GitHub Personal Access
Token")
+ parser.add_argument("--owner", required=True, help="Repository owner")
+ parser.add_argument("--repo", required=True, help="Repository name")
+ parser.add_argument("--run-id", required=True, help="Workflow run ID")
+ parser.add_argument(
+ "--output-dir", default="artifacts", help="Output directory for
downloaded artifacts"
+ )
+
+ args = parser.parse_args()
+
+ download_artifacts(args.token, args.owner, args.repo, args.run_id, args.output_dir)
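The helper above is new in this release; as a rough usage sketch (the token and run ID below are placeholders, not values from this update, and the scripts/ directory is assumed to be importable):

    # Hypothetical call into scripts/wheel_download.py with placeholder arguments.
    import sys
    sys.path.insert(0, "scripts")
    from wheel_download import download_artifacts

    download_artifacts(
        token="<github-personal-access-token>",  # placeholder PAT
        owner="openai",
        repo="tiktoken",
        run_id="1234567890",                     # placeholder workflow run ID
        output_dir="artifacts",                  # wheel zips are extracted here
    )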
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/src/lib.rs new/tiktoken-0.12.0/src/lib.rs
--- old/tiktoken-0.9.0/src/lib.rs 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/src/lib.rs 2025-10-06 22:12:10.000000000 +0200
@@ -172,9 +172,22 @@
impl std::error::Error for DecodeError {}
+#[derive(Debug, Clone)]
+pub struct EncodeError {
+ pub message: String,
+}
+
+impl std::fmt::Display for EncodeError {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ write!(f, "Could not encode string: {}", self.message)
+ }
+}
+
+impl std::error::Error for EncodeError {}
+
const MAX_NUM_THREADS: usize = 128;
-#[cfg_attr(feature = "python", pyclass)]
+#[cfg_attr(feature = "python", pyclass(frozen))]
#[derive(Clone)]
pub struct CoreBPE {
encoder: HashMap<Vec<u8>, Rank>,
@@ -231,7 +244,11 @@
ret
}
- pub fn encode(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<Rank>, usize) {
+ pub fn encode(
+ &self,
+ text: &str,
+ allowed_special: &HashSet<&str>,
+ ) -> Result<(Vec<Rank>, usize), EncodeError> {
let special_regex = self._get_tl_special_regex();
let regex = self._get_tl_regex();
let mut ret = vec![];
@@ -256,9 +273,18 @@
}
let end = next_special.map_or(text.len(), |m| m.start());
- // Okay, here we go, compare this logic to _encode_ordinary_native
- for mat in regex.find_iter(&text[start..end]) {
- let piece = mat.unwrap().as_str().as_bytes();
+ // Okay, here we go, compare this logic to encode_ordinary
+ for mat_res in regex.find_iter(&text[start..end]) {
+ let mat = match mat_res {
+ Ok(m) => m,
+ Err(e) => {
+ return Err(EncodeError {
+ message: format!("Regex error while tokenizing: {e}"),
+ });
+ }
+ };
+
+ let piece = mat.as_str().as_bytes();
if let Some(token) = self.encoder.get(piece) {
last_piece_token_len = 1;
ret.push(*token);
@@ -284,7 +310,7 @@
// last_piece_token_len is how many tokens came from the last regex split. This is used
// for determining unstable tokens, since you can't merge across (stable) regex splits
- (ret, last_piece_token_len)
+ Ok((ret, last_piece_token_len))
}
fn _increase_last_piece_token_len(
@@ -331,7 +357,7 @@
text: &str,
allowed_special: &HashSet<&str>,
) -> (Vec<Rank>, HashSet<Vec<Rank>>) {
- let (tokens, last_piece_token_len) = self.encode(text, allowed_special);
+ let (tokens, last_piece_token_len) = self.encode(text, allowed_special).unwrap();
if last_piece_token_len == 0 {
// If last_piece_token_len is zero, the last token was a special token and we have
// no unstable bytes
@@ -398,7 +424,7 @@
// notice all the big holes in the previous unstable token implementation)
Err(_) => byte_pair_encode(&possibility, &self.encoder),
// Something like the following is intriguing but incorrect:
- // Err(e) => self._encode_ordinary_native(unsafe {
+ // Err(e) => self.encode_ordinary(unsafe {
// std::str::from_utf8_unchecked(&possibility[..e.valid_up_to()])
// }),
};
@@ -427,7 +453,7 @@
if unstable_bytes.len() > 1 {
let last_decoded = bstr::decode_last_utf8(unstable_bytes.as_slice());
if unstable_bytes.len() - last_decoded.1 > 0
- && last_decoded.0.map_or(false, |c| c.is_whitespace())
+ && last_decoded.0.is_some_and(|c| c.is_whitespace())
{
let mut reencoded = byte_pair_encode(
&unstable_bytes[..unstable_bytes.len() - last_decoded.1],
@@ -481,7 +507,9 @@
assert!(
encoder.len() == decoder.len(),
- "Encoder and decoder must be of equal length; maybe you had
duplicate token indices in your encoder?"
+ "Encoder and decoder must be of equal length. Encoder length: {},
decoder length: {}.\nMaybe you had duplicate token indices in your encoder?",
+ encoder.len(),
+ decoder.len()
);
let special_tokens_decoder: HashMap<Rank, Vec<u8>> = special_tokens_encoder
@@ -515,7 +543,7 @@
pub fn encode_with_special_tokens(&self, text: &str) -> Vec<Rank> {
let allowed_special = self.special_tokens();
- self.encode(text, &allowed_special).0
+ self.encode(text, &allowed_special).unwrap().0
}
}
@@ -524,7 +552,7 @@
use fancy_regex::Regex;
use rustc_hash::FxHashMap as HashMap;
- use crate::{byte_pair_split, Rank};
+ use crate::{Rank, byte_pair_split};
fn setup_ranks() -> HashMap<Vec<u8>, Rank> {
HashMap::from_iter([(b"ab".to_vec(), 0), (b"cd".to_vec(), 1)])
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/src/py.rs new/tiktoken-0.12.0/src/py.rs
--- old/tiktoken-0.9.0/src/py.rs 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/src/py.rs 2025-10-06 22:12:10.000000000 +0200
@@ -1,15 +1,14 @@
use std::collections::HashSet;
use pyo3::{
- exceptions,
+ IntoPyObjectExt, PyResult, exceptions,
prelude::*,
pybacked::PyBackedStr,
- types::{PyBytes, PyList, PyTuple},
- PyResult,
+ types::{PyBytes, PyList},
};
use rustc_hash::FxHashMap as HashMap;
-use crate::{byte_pair_encode, CoreBPE, Rank};
+use crate::{CoreBPE, Rank, byte_pair_encode};
#[pymethods]
impl CoreBPE {
@@ -19,12 +18,8 @@
special_tokens_encoder: HashMap<String, Rank>,
pattern: &str,
) -> PyResult<Self> {
- Self::new_internal(
- encoder,
- special_tokens_encoder,
- pattern,
- )
- .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))
+ Self::new_internal(encoder, special_tokens_encoder, pattern)
+ .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))
}
// ====================
@@ -33,7 +28,7 @@
#[pyo3(name = "encode_ordinary")]
fn py_encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> {
- py.allow_threads(|| self.encode_ordinary(text))
+ py.detach(|| self.encode_ordinary(text))
}
#[pyo3(name = "encode")]
@@ -42,11 +37,14 @@
py: Python,
text: &str,
allowed_special: HashSet<PyBackedStr>,
- ) -> Vec<Rank> {
- py.allow_threads(|| {
+ ) -> PyResult<Vec<Rank>> {
+ py.detach(|| {
let allowed_special: HashSet<&str> =
allowed_special.iter().map(|s| s.as_ref()).collect();
- self.encode(text, &allowed_special).0
+ match self.encode(text, &allowed_special) {
+ Ok((tokens, _)) => Ok(tokens),
+ Err(e) => Err(PyErr::new::<exceptions::PyValueError, _>(e.message)),
+ }
})
}
@@ -55,36 +53,54 @@
py: Python,
text: &str,
allowed_special: HashSet<PyBackedStr>,
- ) -> Py<PyAny> {
- let tokens = py.allow_threads(|| {
+ ) -> PyResult<Py<PyAny>> {
+ let tokens_res = py.detach(|| {
let allowed_special: HashSet<&str> =
allowed_special.iter().map(|s| s.as_ref()).collect();
- self.encode(text, &allowed_special).0
+ self.encode(text, &allowed_special)
});
+
+ let tokens = match tokens_res {
+ Ok((tokens, _)) => tokens,
+ Err(e) => return Err(PyErr::new::<exceptions::PyValueError, _>(e.message)),
+ };
+
let buffer = TiktokenBuffer { tokens };
- buffer.into_py(py)
+ buffer.into_py_any(py)
}
fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> {
- py.allow_threads(|| {
+ py.detach(|| {
match std::str::from_utf8(bytes) {
+ // Straightforward case
Ok(text) => self.encode_ordinary(text),
+ // Oops, don't actually have UTF-8. But we need to do the regex splitting in
+ // Unicode space, so we make our best guess at where we would have splits
Err(e) => {
let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
- let (tokens, last_piece_token_len) = self.encode(text, &HashSet::new());
+ let (tokens, last_piece_token_len) =
+ self.encode(text, &HashSet::new()).unwrap();
let (mut tokens, last_piece_token_len) =
self._increase_last_piece_token_len(tokens, last_piece_token_len);
+
+ let mut unstable_bytes;
if !tokens.is_empty() && last_piece_token_len > 0 {
// Lop off the tokens from the last piece and run BPE on the remaining bytes
- // Somewhat niche, but this may not be correct if we'd have had a regex
- // split between the valid UTF-8 and the invalid bytes, which is why this
- // method is private
- let mut unstable_bytes = self
+ // This likely matches what models see better, e.g. if you assume we're
+ // dealing with truncated UTF-8 bytes.
+ // Niche, but note this may not be correct if we'd have had a regex
+ // split between the valid UTF-8 and the invalid bytes.
+ unstable_bytes = self
.decode_bytes(&tokens[tokens.len() - last_piece_token_len..])
.unwrap();
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
tokens.truncate(tokens.len() - last_piece_token_len);
+ } else {
+ unstable_bytes = bytes[e.valid_up_to()..].to_vec();
+ }
+
+ if !unstable_bytes.is_empty() {
match self.encoder.get(&unstable_bytes) {
Some(token) => tokens.push(*token),
None => {
@@ -104,19 +120,14 @@
py: Python,
text: &str,
allowed_special: HashSet<PyBackedStr>,
- ) -> Py<PyTuple> {
- let (tokens, completions) = py.allow_threads(|| {
+ ) -> PyResult<(Vec<Rank>, Py<PyList>)> {
+ let (tokens, completions): (Vec<Rank>, HashSet<Vec<Rank>>) = py.detach(|| {
let allowed_special: HashSet<&str> =
allowed_special.iter().map(|s| s.as_ref()).collect();
self._encode_unstable_native(text, &allowed_special)
});
- let py_completions = PyList::new_bound(
- py,
- completions
- .iter()
- .map(|seq| PyList::new_bound(py, &seq[..])),
- );
- (tokens, py_completions).into_py(py)
+ let py_completions = PyList::new(py, completions.into_iter())?;
+ Ok((tokens, py_completions.into()))
}
fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> {
@@ -144,18 +155,18 @@
#[pyo3(name = "decode_bytes")]
fn py_decode_bytes(&self, py: Python, tokens: Vec<Rank>) -> Result<Py<PyBytes>, PyErr> {
- match py.allow_threads(|| self.decode_bytes(&tokens)) {
- Ok(bytes) => Ok(PyBytes::new_bound(py, &bytes).into()),
+ match py.detach(|| self.decode_bytes(&tokens)) {
+ Ok(bytes) => Ok(PyBytes::new(py, &bytes).into()),
Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}", e))),
}
}
fn decode_single_token_bytes(&self, py: Python, token: Rank) -> PyResult<Py<PyBytes>> {
if let Some(bytes) = self.decoder.get(&token) {
- return Ok(PyBytes::new_bound(py, bytes).into());
+ return Ok(PyBytes::new(py, bytes).into());
}
if let Some(bytes) = self.special_tokens_decoder.get(&token) {
- return Ok(PyBytes::new_bound(py, bytes).into());
+ return Ok(PyBytes::new(py, bytes).into());
}
Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
}
@@ -167,12 +178,12 @@
fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
self.sorted_token_bytes
.iter()
- .map(|x| PyBytes::new_bound(py, x).into())
+ .map(|x| PyBytes::new(py, x).into())
.collect()
}
}
-#[pyclass]
+#[pyclass(frozen)]
struct TiktokenBuffer {
tokens: Vec<Rank>,
}
@@ -193,43 +204,51 @@
"Object is not writable",
));
}
-
- (*view).obj = slf.clone().into_any().into_ptr();
-
- let data = &slf.borrow().tokens;
- (*view).buf = data.as_ptr() as *mut std::os::raw::c_void;
- (*view).len = (data.len() * std::mem::size_of::<Rank>()) as isize;
- (*view).readonly = 1;
- (*view).itemsize = std::mem::size_of::<Rank>() as isize;
- (*view).format = if (flags & pyo3::ffi::PyBUF_FORMAT) == pyo3::ffi::PyBUF_FORMAT {
- let msg = std::ffi::CString::new("I").unwrap();
- msg.into_raw()
- } else {
- std::ptr::null_mut()
- };
- (*view).ndim = 1;
- (*view).shape = if (flags & pyo3::ffi::PyBUF_ND) == pyo3::ffi::PyBUF_ND {
- &mut (*view).len
- } else {
- std::ptr::null_mut()
- };
- (*view).strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == pyo3::ffi::PyBUF_STRIDES {
- &mut (*view).itemsize
- } else {
- std::ptr::null_mut()
- };
- (*view).suboffsets = std::ptr::null_mut();
- (*view).internal = std::ptr::null_mut();
+ unsafe {
+ let view_ref = &mut *view;
+ view_ref.obj = slf.clone().into_any().into_ptr();
+
+ let data = &slf.borrow().tokens;
+ view_ref.buf = data.as_ptr() as *mut std::os::raw::c_void;
+ view_ref.len = (data.len() * std::mem::size_of::<Rank>()) as isize;
+ view_ref.readonly = 1;
+ view_ref.itemsize = std::mem::size_of::<Rank>() as isize;
+ view_ref.format = if (flags & pyo3::ffi::PyBUF_FORMAT) == pyo3::ffi::PyBUF_FORMAT {
+ let msg = std::ffi::CString::new("I").unwrap();
+ msg.into_raw()
+ } else {
+ std::ptr::null_mut()
+ };
+ view_ref.ndim = 1;
+ view_ref.shape = if (flags & pyo3::ffi::PyBUF_ND) == pyo3::ffi::PyBUF_ND {
+ &mut view_ref.len
+ } else {
+ std::ptr::null_mut()
+ };
+ view_ref.strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == pyo3::ffi::PyBUF_STRIDES {
+ &mut view_ref.itemsize
+ } else {
+ std::ptr::null_mut()
+ };
+ view_ref.suboffsets = std::ptr::null_mut();
+ view_ref.internal = std::ptr::null_mut();
+ }
Ok(())
}
unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) {
- std::mem::drop(std::ffi::CString::from_raw((*view).format));
+ // Note that Py_buffer doesn't have a Drop impl
+ unsafe {
+ let view_ref = &mut *view;
+ if !view_ref.format.is_null() {
+ std::mem::drop(std::ffi::CString::from_raw(view_ref.format));
+ }
+ }
}
}
-#[pymodule]
+#[pymodule(gil_used = false)]
fn _tiktoken(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
m.add_class::<CoreBPE>()?;
Ok(())
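From the Python side, the practical effect of the new EncodeError path in py.rs is that encode failures surface as ValueError instead of a Rust panic; a minimal sketch mirroring the new test_large_repeated test (fetching the o200k_base vocabulary needs network access):

    # Pathological input that makes the regex engine give up now fails cleanly.
    import tiktoken

    enc = tiktoken.get_encoding("o200k_base")
    try:
        enc.encode("x" * 1_000_000)
    except ValueError as exc:
        print("encode raised ValueError as expected:", exc)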
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/tests/test_encoding.py new/tiktoken-0.12.0/tests/test_encoding.py
--- old/tiktoken-0.9.0/tests/test_encoding.py 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/tests/test_encoding.py 2025-10-06 22:12:10.000000000 +0200
@@ -49,6 +49,13 @@
assert enc.encode("00000000000000000") == [8269, 10535, 830]
+def test_large_repeated():
+ enc = tiktoken.get_encoding("o200k_base")
+
+ with pytest.raises(ValueError):
+ enc.encode("x" * 1_000_000)
+
+
def test_simple_regex():
enc = tiktoken.get_encoding("cl100k_base")
assert enc.encode("rer") == [38149]
@@ -78,6 +85,17 @@
def test_encode_bytes():
enc = tiktoken.get_encoding("cl100k_base")
assert enc._encode_bytes(b" \xec\x8b\xa4\xed") == [62085]
+ for i in range(10):
+ bytestring = b"\x80" * i
+ assert enc.decode_bytes(enc._encode_bytes(bytestring)) == bytestring
+
+
[email protected]("make_enc", ENCODING_FACTORIES)
[email protected](bytestring=st.binary())
[email protected](deadline=None, max_examples=MAX_EXAMPLES)
+def test_hyp_encode_bytes(make_enc: Callable[[], tiktoken.Encoding], bytestring: bytes):
+ enc = make_enc()
+ assert enc.decode_bytes(enc._encode_bytes(bytestring)) == bytestring
def test_encode_surrogate_pairs():
@@ -129,7 +147,7 @@
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(text=st.text())
[email protected](deadline=None)
[email protected](deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_roundtrip(make_enc: Callable[[], tiktoken.Encoding], text):
enc = make_enc()
@@ -235,11 +253,11 @@
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(batch=st.lists(st.text()))
[email protected](deadline=None)
[email protected](deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_batch_roundtrip(make_enc: Callable[[], tiktoken.Encoding], batch):
enc = make_enc()
- encoded = enc.encode_batch(batch)
- assert encoded == [enc.encode(t) for t in batch]
+ encoded = enc.encode_batch(batch, allowed_special="all")
+ assert encoded == [enc.encode(t, allowed_special="all") for t in batch]
decoded = enc.decode_batch(encoded)
assert decoded == batch
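A standalone sketch of the invalid-UTF-8 round-trip the new tests cover; _encode_bytes is a private API and is shown here only to illustrate the behaviour being tested:

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    # Bytestrings that are not valid UTF-8 now round-trip through the byte encoder.
    for i in range(10):
        bytestring = b"\x80" * i
        assert enc.decode_bytes(enc._encode_bytes(bytestring)) == bytestring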
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/tests/test_misc.py new/tiktoken-0.12.0/tests/test_misc.py
--- old/tiktoken-0.9.0/tests/test_misc.py 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/tests/test_misc.py 2025-10-06 22:12:10.000000000 +0200
@@ -17,6 +17,8 @@
assert enc.name == "cl100k_base"
enc = tiktoken.encoding_for_model("gpt-4o")
assert enc.name == "o200k_base"
+ enc = tiktoken.encoding_for_model("gpt-oss-120b")
+ assert enc.name == "o200k_harmony"
def test_optional_blobfile_dependency():
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/tiktoken/__init__.py new/tiktoken-0.12.0/tiktoken/__init__.py
--- old/tiktoken-0.9.0/tiktoken/__init__.py 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/tiktoken/__init__.py 2025-10-06 22:12:10.000000000 +0200
@@ -5,4 +5,4 @@
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names
-__version__ = "0.9.0"
+__version__ = "0.12.0"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/tiktoken/core.py new/tiktoken-0.12.0/tiktoken/core.py
--- old/tiktoken-0.9.0/tiktoken/core.py 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/tiktoken/core.py 2025-10-06 22:12:10.000000000 +0200
@@ -4,11 +4,11 @@
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, Sequence
-import regex
-
from tiktoken import _tiktoken
if TYPE_CHECKING:
+ import re
+
import numpy as np
import numpy.typing as npt
@@ -155,7 +155,7 @@
import numpy as np
- buffer = self._core_bpe.encode_to_tiktoken_buffer(text, self.special_tokens_set)
+ buffer = self._core_bpe.encode_to_tiktoken_buffer(text, allowed_special)
return np.frombuffer(buffer, dtype=np.uint32)
def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
@@ -391,10 +391,13 @@
def _encode_only_native_bpe(self, text: str) -> list[int]:
"""Encodes a string into tokens, but do regex splitting in Python."""
+ # We need specifically `regex` in order to compile pat_str due to e.g. \p
+ import regex
+
_unused_pat = regex.compile(self._pat_str)
ret = []
for piece in regex.findall(_unused_pat, text):
- ret.extend(self._core_bpe.encode_single_piece(piece))
+ ret.extend(self._core_bpe.encode_single_piece(piece.encode("utf-8")))
return ret
def _encode_bytes(self, text: bytes) -> list[int]:
@@ -423,9 +426,13 @@
@functools.lru_cache(maxsize=128)
-def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
- inner = "|".join(regex.escape(token) for token in tokens)
- return regex.compile(f"({inner})")
+def _special_token_regex(tokens: frozenset[str]) -> re.Pattern[str]:
+ try:
+ import regex as re
+ except ImportError:
+ import re
+ inner = "|".join(re.escape(token) for token in tokens)
+ return re.compile(f"({inner})")
def raise_disallowed_special_token(token: str) -> NoReturn:
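The core.py hunk above makes the buffer-based encoder honour the caller's allowed_special set; a hedged sketch of the visible behaviour, assuming encode_to_numpy (the public wrapper named in the upstream changelog) takes the same allowed_special keyword as Encoding.encode:

    import numpy as np
    import tiktoken

    enc = tiktoken.get_encoding("o200k_base")
    # The special token is only accepted because it is explicitly allowed here.
    tokens = enc.encode_to_numpy("<|endoftext|>", allowed_special={"<|endoftext|>"})
    assert tokens.dtype == np.uint32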
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/tiktoken/load.py new/tiktoken-0.12.0/tiktoken/load.py
--- old/tiktoken-0.9.0/tiktoken/load.py 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/tiktoken/load.py 2025-10-06 22:12:10.000000000 +0200
@@ -6,22 +6,26 @@
def read_file(blobpath: str) -> bytes:
- if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
- try:
- import blobfile
- except ImportError as e:
- raise ImportError(
- "blobfile is not installed. Please install it by running `pip
install blobfile`."
- ) from e
- with blobfile.BlobFile(blobpath, "rb") as f:
+ if "://" not in blobpath:
+ with open(blobpath, "rb", buffering=0) as f:
return f.read()
- # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
- import requests
+ if blobpath.startswith(("http://", "https://")):
+ # avoiding blobfile for public files helps avoid auth issues, like MFA prompts.
+ import requests
+
+ resp = requests.get(blobpath)
+ resp.raise_for_status()
+ return resp.content
- resp = requests.get(blobpath)
- resp.raise_for_status()
- return resp.content
+ try:
+ import blobfile
+ except ImportError as e:
+ raise ImportError(
+ "blobfile is not installed. Please install it by running `pip
install blobfile`."
+ ) from e
+ with blobfile.BlobFile(blobpath, "rb") as f:
+ return f.read()
def check_hash(data: bytes, expected_hash: str) -> bool:
@@ -49,7 +53,7 @@
cache_path = os.path.join(cache_dir, cache_key)
if os.path.exists(cache_path):
- with open(cache_path, "rb") as f:
+ with open(cache_path, "rb", buffering=0) as f:
data = f.read()
if expected_hash is None or check_hash(data, expected_hash):
return data
@@ -88,6 +92,7 @@
encoder_json_file: str,
vocab_bpe_hash: str | None = None,
encoder_json_hash: str | None = None,
+ clobber_one_byte_tokens: bool = False,
) -> dict[bytes, int]:
# NB: do not add caching to this function
rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
@@ -109,7 +114,10 @@
return bytes(data_gym_byte_to_byte[b] for b in value)
# add the single byte tokens
+ # if clobber_one_byte_tokens is True, we'll replace these with ones from the encoder json
bpe_ranks = {bytes([b]): i for i, b in enumerate(rank_to_intbyte)}
+ del rank_to_intbyte
+
# add the merged tokens
n = len(bpe_ranks)
for first, second in bpe_merges:
@@ -126,6 +134,12 @@
# drop these two special tokens if present, since they're not mergeable bpe tokens
encoder_json_loaded.pop(b"<|endoftext|>", None)
encoder_json_loaded.pop(b"<|startoftext|>", None)
+
+ if clobber_one_byte_tokens:
+ for k in encoder_json_loaded:
+ if len(k) == 1:
+ bpe_ranks[k] = encoder_json_loaded[k]
+
assert bpe_ranks == encoder_json_loaded
return bpe_ranks
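The read_file rewrite above means plain local paths no longer go through blobfile; a minimal sketch, with a temporary file standing in for a vocabulary file on disk:

    import tempfile
    from tiktoken.load import read_file

    # Anything without "://" is read with a plain open() call.
    with tempfile.NamedTemporaryFile(suffix=".tiktoken", delete=False) as f:
        f.write(b"dummy vocabulary data")
        path = f.name
    assert read_file(path) == b"dummy vocabulary data"
    # http(s) URLs are fetched with requests; other schemes (e.g. gs://) still need blobfile.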
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/tiktoken/model.py new/tiktoken-0.12.0/tiktoken/model.py
--- old/tiktoken-0.9.0/tiktoken/model.py 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/tiktoken/model.py 2025-10-06 22:12:10.000000000 +0200
@@ -7,12 +7,17 @@
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
"o1-": "o200k_base",
"o3-": "o200k_base",
+ "o4-mini-": "o200k_base",
# chat
+ "gpt-5-": "o200k_base",
+ "gpt-4.5-": "o200k_base",
+ "gpt-4.1-": "o200k_base",
"chatgpt-4o-": "o200k_base",
"gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
+ "gpt-oss-": "o200k_harmony",
# fine-tuned
"ft:gpt-4o": "o200k_base",
"ft:gpt-4": "cl100k_base",
@@ -25,7 +30,10 @@
# reasoning
"o1": "o200k_base",
"o3": "o200k_base",
+ "o4-mini": "o200k_base",
# chat
+ "gpt-5": "o200k_base",
+ "gpt-4.1": "o200k_base",
"gpt-4o": "o200k_base",
"gpt-4": "cl100k_base",
"gpt-3.5-turbo": "cl100k_base",
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.9.0/tiktoken_ext/openai_public.py new/tiktoken-0.12.0/tiktoken_ext/openai_public.py
--- old/tiktoken-0.9.0/tiktoken_ext/openai_public.py 2025-02-14 06:53:03.000000000 +0100
+++ new/tiktoken-0.12.0/tiktoken_ext/openai_public.py 2025-10-06 22:12:10.000000000 +0200
@@ -120,6 +120,37 @@
}
+def o200k_harmony():
+ base_enc = o200k_base()
+ name = "o200k_harmony"
+ pat_str = base_enc["pat_str"]
+ mergeable_ranks = base_enc["mergeable_ranks"]
+ special_tokens = {
+ **base_enc["special_tokens"],
+ "<|startoftext|>": 199998,
+ "<|endoftext|>": 199999,
+ "<|reserved_200000|>": 200000,
+ "<|reserved_200001|>": 200001,
+ "<|return|>": 200002,
+ "<|constrain|>": 200003,
+ "<|reserved_200004|>": 200004,
+ "<|channel|>": 200005,
+ "<|start|>": 200006,
+ "<|end|>": 200007,
+ "<|message|>": 200008,
+ "<|reserved_200009|>": 200009,
+ "<|reserved_200010|>": 200010,
+ "<|reserved_200011|>": 200011,
+ "<|call|>": 200012,
+ } | {f"<|reserved_{i}|>": i for i in range(200013, 201088)}
+ return {
+ "name": name,
+ "pat_str": pat_str,
+ "mergeable_ranks": mergeable_ranks,
+ "special_tokens": special_tokens,
+ }
+
+
ENCODING_CONSTRUCTORS = {
"gpt2": gpt2,
"r50k_base": r50k_base,
@@ -127,4 +158,5 @@
"p50k_edit": p50k_edit,
"cl100k_base": cl100k_base,
"o200k_base": o200k_base,
+ "o200k_harmony": o200k_harmony,
}
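A short sketch of the new o200k_harmony encoding in use: it reuses o200k_base's pattern and mergeable ranks and adds the harmony chat special tokens defined above:

    import tiktoken

    enc = tiktoken.get_encoding("o200k_harmony")
    ids = enc.encode("<|start|>assistant<|message|>hi<|end|>", allowed_special="all")
    # The special tokens map to the ranks registered in o200k_harmony().
    assert {200006, 200008, 200007} <= set(ids)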
++++++ tiktoken.obsinfo ++++++
name: tiktoken
version: 0.12.0
mtime: 1759781530
commit: 97e49cbadd500b5cc9dbb51a486f0b42e6701bee
++++++ vendor.tar.zst ++++++
++++ 1210388 lines of diff (skipped)