new limits

This commit is contained in:
Karl 2025-09-23 15:53:37 +01:00
parent 650369b06f
commit 2172d7da7f

View File

@@ -8,12 +8,12 @@ class CLIPTextChunker:
Using a conservative limit of 70 tokens to account for special tokens. Using a conservative limit of 70 tokens to account for special tokens.
""" """
def __init__(self, max_tokens: int = 40): def __init__(self, max_tokens: int = 25):
""" """
Initialize the text chunker. Initialize the text chunker.
Args: Args:
max_tokens (int): Maximum number of tokens per chunk (default: 40 for CLIP, being very conservative) max_tokens (int): Maximum number of tokens per chunk (default: 25 for CLIP, being ultra conservative)
""" """
self.max_tokens = max_tokens self.max_tokens = max_tokens
self._tokenizer = None self._tokenizer = None
@@ -43,8 +43,8 @@ class CLIPTextChunker:
""" """
if self.tokenizer is None: if self.tokenizer is None:
# Fallback to character count if tokenizer not available # Fallback to character count if tokenizer not available
# Use a very conservative estimate: ~0.6 tokens per character for CLIP # Use an ultra conservative estimate: ~0.3 tokens per character for CLIP
return int(len(text) * 0.6) return int(len(text) * 0.3)
tokens = self.tokenizer( tokens = self.tokenizer(
text, text,
@@ -155,14 +155,14 @@ class CLIPTextChunker:
# Fallback to regular chunking # Fallback to regular chunking
return self.chunk_text(text) return self.chunk_text(text)
def chunk_prompt_for_clip(prompt: str, max_tokens: int = 40) -> List[str]: def chunk_prompt_for_clip(prompt: str, max_tokens: int = 25) -> List[str]:
""" """
Convenience function to chunk a prompt for CLIP processing. Convenience function to chunk a prompt for CLIP processing.
Uses a conservative 40 token limit to be safe. Uses a conservative 25 token limit to be safe.
Args: Args:
prompt (str): The prompt to chunk prompt (str): The prompt to chunk
max_tokens (int): Maximum tokens per chunk (default: 40 for safety) max_tokens (int): Maximum tokens per chunk (default: 25 for safety)
Returns: Returns:
List[str]: List of prompt chunks List[str]: List of prompt chunks