Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/cohere/manually_maintained/tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,7 @@ def _get_tokenizer_config_size(tokenizer_url: str) -> float:
if size:
break

return round(int(typing.cast(int, size)) / 1024 / 1024, 2)
if size is None:
raise ValueError("Content-Length unavailable (server may use chunked transfer encoding)")

return round(int(size) / 1024 / 1024, 2)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep the `typing.cast` here unless you had a specific reason to remove it.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.

47 changes: 47 additions & 0 deletions tests/test_tokenizer_config_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import sys
import types
import unittest
from unittest.mock import MagicMock, patch

# The real `tokenizers` package ships a native C extension that may be
# unavailable in CI. Register a minimal stand-in module before importing the
# code under test so that its `import tokenizers` succeeds regardless.
if "tokenizers" not in sys.modules:
    _stub = types.ModuleType("tokenizers")
    _stub.Tokenizer = object  # type: ignore[attr-defined]
    sys.modules["tokenizers"] = _stub

from cohere.manually_maintained.tokenizers import _get_tokenizer_config_size


class TestGetTokenizerConfigSize(unittest.TestCase):
    """Tests for the header-based size probe ``_get_tokenizer_config_size``."""

    def _make_head_response(self, headers: dict) -> MagicMock:
        """Return a fake ``requests.head`` response exposing *headers*."""
        response = MagicMock()
        response.headers = headers
        return response

    def test_content_length_header(self) -> None:
        """A plain Content-Length of 2 MiB is reported as 2.0."""
        fake = self._make_head_response({"Content-Length": "2097152"})
        with patch("requests.head", return_value=fake):
            result = _get_tokenizer_config_size("https://example.com/tokenizer.json")
        self.assertAlmostEqual(result, 2.0)

    def test_goog_stored_content_length_header(self) -> None:
        """The GCS-specific stored-content-length header is honoured on its own."""
        fake = self._make_head_response({"x-goog-stored-content-length": "1048576"})
        with patch("requests.head", return_value=fake):
            result = _get_tokenizer_config_size("https://example.com/tokenizer.json")
        self.assertAlmostEqual(result, 1.0)

    def test_goog_header_takes_priority_over_content_length(self) -> None:
        """When both size headers are present, the GCS header wins."""
        fake = self._make_head_response(
            {"x-goog-stored-content-length": "1048576", "Content-Length": "2097152"}
        )
        with patch("requests.head", return_value=fake):
            result = _get_tokenizer_config_size("https://example.com/tokenizer.json")
        self.assertAlmostEqual(result, 1.0)

    def test_raises_value_error_when_no_size_header(self) -> None:
        """Chunked-transfer responses omit Content-Length; must raise ValueError, not TypeError."""
        with patch("requests.head", return_value=self._make_head_response({})):
            with self.assertRaises(ValueError) as ctx:
                _get_tokenizer_config_size("https://example.com/tokenizer.json")
        self.assertIn("Content-Length unavailable", str(ctx.exception))