From e458e748a3085878ea9c7cb8747fdffdeb14ceaa Mon Sep 17 00:00:00 2001 From: Karl Date: Tue, 23 Sep 2025 17:08:52 +0100 Subject: [PATCH] text chunking --- lib/text_chunker.py | 14 +++++++------- test_text_chunker.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/text_chunker.py b/lib/text_chunker.py index fd27470..e03a732 100644 --- a/lib/text_chunker.py +++ b/lib/text_chunker.py @@ -8,12 +8,12 @@ class CLIPTextChunker: Using a conservative limit of 70 tokens to account for special tokens. """ - def __init__(self, max_tokens: int = 70): + def __init__(self, max_tokens: int = 60): """ Initialize the text chunker. Args: - max_tokens (int): Maximum number of tokens per chunk (default: 70 for CLIP, leaving buffer for special tokens) + max_tokens (int): Maximum number of tokens per chunk (default: 60 for CLIP, being extra conservative) """ self.max_tokens = max_tokens self._tokenizer = None @@ -44,8 +44,8 @@ class CLIPTextChunker: if self.tokenizer is None: # Fallback to character count if tokenizer not available # CLIP tokenization is roughly 0.25-0.3 characters per token on average - # Use 0.25 for a more conservative estimate to avoid exceeding limits - return int(len(text) * 0.25) + # NOTE(review): 0.2 tokens/char is BELOW CLIP's typical ~0.25-0.3 tokens/char, so this UNDER-estimates token counts and can allow over-long chunks — confirm a higher ratio isn't needed + return int(len(text) * 0.2) tokens = self.tokenizer( text, @@ -165,14 +165,14 @@ class CLIPTextChunker: # Fallback to regular chunking return self.chunk_text(text) -def chunk_prompt_for_clip(prompt: str, max_tokens: int = 70) -> List[str]: +def chunk_prompt_for_clip(prompt: str, max_tokens: int = 60) -> List[str]: """ Convenience function to chunk a prompt for CLIP processing. - Uses a 70 token limit to be safe while allowing meaningful prompts. + Uses a 60 token limit to be extra safe for any CLIP model. 
Args: prompt (str): The prompt to chunk - max_tokens (int): Maximum tokens per chunk (default: 70 for CLIP compatibility) + max_tokens (int): Maximum tokens per chunk (default: 60 for maximum CLIP compatibility) Returns: List[str]: List of prompt chunks diff --git a/test_text_chunker.py b/test_text_chunker.py index 3a453f7..9b5cbad 100644 --- a/test_text_chunker.py +++ b/test_text_chunker.py @@ -20,7 +20,7 @@ def test_long_prompt_chunking(): print("-" * 80) # Test the chunking - chunker = CLIPTextChunker(max_tokens=70) + chunker = CLIPTextChunker(max_tokens=60) chunks = chunk_prompt_for_clip(test_prompt) print(f"Number of chunks: {len(chunks)}") @@ -35,8 +35,8 @@ def test_long_prompt_chunking(): if token_count > 77: print(f" ❌ ERROR: Chunk {i+1} exceeds CLIP's 77 token limit!") return False - elif token_count > 70: - print(f" ⚠️ WARNING: Chunk {i+1} is close to the 77 token limit") + elif token_count > 60: + print(f" ⚠️ WARNING: Chunk {i+1} is close to the 60 token limit") else: print(f" ✅ Chunk {i+1} is within safe limits") @@ -47,7 +47,7 @@ def test_long_prompt_chunking(): def test_edge_cases(): """Test edge cases for the chunking functionality.""" - chunker = CLIPTextChunker(max_tokens=70) + chunker = CLIPTextChunker(max_tokens=60) # Test empty string chunks = chunker.chunk_text("") @@ -63,7 +63,7 @@ def test_edge_cases(): chunks = chunker.chunk_text(long_word) # Should handle this gracefully for chunk in chunks: - assert chunker.get_token_count(chunk) <= 70, "Long word chunks should respect token limit" + assert chunker.get_token_count(chunk) <= 60, "Long word chunks should respect token limit" print("✅ Edge case tests passed!") return True