From 650369b06f8fbd31fb98417c084d0c3ae29a7e96 Mon Sep 17 00:00:00 2001
From: Karl
Date: Tue, 23 Sep 2025 15:49:06 +0100
Subject: [PATCH] text chunking: lower the CLIP limit to 40 tokens and harden tokenizer loading

---
 lib/text_chunker.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/lib/text_chunker.py b/lib/text_chunker.py
index 3314294..93c417e 100644
--- a/lib/text_chunker.py
+++ b/lib/text_chunker.py
@@ -8,12 +8,12 @@ class CLIPTextChunker:
     Using a conservative limit of 70 tokens to account for special tokens.
     """
 
-    def __init__(self, max_tokens: int = 70):
+    def __init__(self, max_tokens: int = 40):
         """
         Initialize the text chunker.
 
         Args:
-            max_tokens (int): Maximum number of tokens per chunk (default: 70 for CLIP, being conservative)
+            max_tokens (int): Maximum number of tokens per chunk (default: 40 for CLIP, being very conservative)
         """
         self.max_tokens = max_tokens
         self._tokenizer = None
@@ -24,8 +24,9 @@ class CLIPTextChunker:
         if self._tokenizer is None:
             try:
                 from transformers import CLIPTokenizer
-                self._tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
-            except ImportError:
+                # Allow a network download when the tokenizer is not cached locally
+                self._tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", local_files_only=False)
+            except Exception:
                 # Fallback to character-based estimation if transformers not available
                 self._tokenizer = None
         return self._tokenizer
@@ -42,7 +43,8 @@ class CLIPTextChunker:
         """
         if self.tokenizer is None:
             # Fallback to character count if tokenizer not available
-            return len(text)
+            # Conservative estimate: ~0.6 tokens per character, which overestimates for typical English text
+            return int(len(text) * 0.6)
 
         tokens = self.tokenizer(
             text,
@@ -153,14 +155,14 @@ class CLIPTextChunker:
         # Fallback to regular chunking
         return self.chunk_text(text)
 
-def chunk_prompt_for_clip(prompt: str, max_tokens: int = 70) -> List[str]:
+def chunk_prompt_for_clip(prompt: str, max_tokens: int = 40) -> List[str]:
     """
     Convenience function to chunk a prompt for CLIP processing.
-    Uses a conservative 70 token limit to be safe.
+    Uses a conservative 40 token limit to be safe.
 
     Args:
         prompt (str): The prompt to chunk
-        max_tokens (int): Maximum tokens per chunk (default: 70 for safety)
+        max_tokens (int): Maximum tokens per chunk (default: 40 for safety)
 
     Returns:
         List[str]: List of prompt chunks
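
Usage sketch for reviewers (illustrative only, not part of the patch): the snippet below assumes transformers is installed and the repo root is on PYTHONPATH so lib.text_chunker imports; it exercises only names visible in this diff (CLIPTextChunker, chunk_text, chunk_prompt_for_clip), and the prompt string itself is made up.

    from lib.text_chunker import CLIPTextChunker, chunk_prompt_for_clip

    # A prompt well past CLIP's 77-token context window.
    prompt = "a detailed oil painting of a lighthouse at dusk, " * 20

    # Convenience wrapper, now defaulting to the conservative 40-token limit.
    chunks = chunk_prompt_for_clip(prompt)

    # Same thing through the class API, with the limit made explicit.
    chunker = CLIPTextChunker(max_tokens=40)
    chunks = chunker.chunk_text(prompt)

    for i, chunk in enumerate(chunks):
        print(i, len(chunk), chunk[:50])

If the tokenizer cannot be loaded, token counts come from the character-based fallback (int(len(text) * 0.6)), which overestimates for typical English text and therefore errs toward smaller, safer chunks.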