text chunker

This commit is contained in:
Karl 2025-09-23 16:00:27 +01:00
parent 2172d7da7f
commit 6aeeb74e8f
2 changed files with 97 additions and 114 deletions

View File

@@ -8,12 +8,12 @@ class CLIPTextChunker:
Using a conservative limit of 70 tokens to account for special tokens.
"""
def __init__(self, max_tokens: int = 25):
def __init__(self, max_tokens: int = 70):
"""
Initialize the text chunker.
Args:
max_tokens (int): Maximum number of tokens per chunk (default: 25 for CLIP, being ultra conservative)
max_tokens (int): Maximum number of tokens per chunk (default: 70 for CLIP, leaving buffer for special tokens)
"""
self.max_tokens = max_tokens
self._tokenizer = None
@@ -43,8 +43,9 @@ class CLIPTextChunker:
"""
if self.tokenizer is None:
# Fallback to character count if tokenizer not available
# Use an ultra conservative estimate: ~0.3 characters per token for CLIP
return int(len(text) * 0.3)
# CLIP tokenization is roughly 0.25-0.3 characters per token on average
# Use 0.25 for a more conservative estimate to avoid exceeding limits
return int(len(text) * 0.25)
tokens = self.tokenizer(
text,
@@ -70,33 +71,45 @@ class CLIPTextChunker:
if not text.strip():
return []
# If text already fits within the limit, return as-is
if self.get_token_count(text) <= self.max_tokens:
return [text]
chunks = []
words = text.split()
current_chunk = []
current_tokens = 0
sentences = re.split(r'(?<=[.!?])\s+', text) if preserve_sentences else text.split()
for word in words:
word_with_space = word + " "
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# Check if adding this word would exceed the limit
test_chunk = " ".join(current_chunk + [word])
test_tokens = self.get_token_count(test_chunk)
# If a single sentence is too long, we need to break it down further
if self.get_token_count(sentence) > self.max_tokens:
# Break sentence into smaller chunks
words = sentence.split()
current_chunk = []
if test_tokens > self.max_tokens and current_chunk:
# Current chunk is complete, add it
chunks.append(" ".join(current_chunk))
current_chunk = [word]
current_tokens = self.get_token_count(word)
for word in words:
# Test if adding this word would exceed the limit
test_chunk = " ".join(current_chunk + [word])
if self.get_token_count(test_chunk) <= self.max_tokens:
current_chunk.append(word)
else:
# Current chunk is full, save it
if current_chunk:
chunks.append(" ".join(current_chunk))
# Start new chunk with current word
current_chunk = [word]
# Add the last chunk
if current_chunk:
chunks.append(" ".join(current_chunk))
else:
current_chunk.append(word)
current_tokens = test_tokens
# Add the last chunk if it exists
if current_chunk:
chunks.append(" ".join(current_chunk))
# Check if adding this sentence to the last chunk would exceed the limit
if chunks and self.get_token_count(chunks[-1] + " " + sentence) <= self.max_tokens:
chunks[-1] += " " + sentence
else:
chunks.append(sentence)
return chunks
@@ -135,34 +148,31 @@ class CLIPTextChunker:
if last_space > 0:
candidate_chunk = candidate_chunk[:last_space]
# Use the basic chunking to ensure proper word boundaries
# Check if this candidate chunk fits within token limits
if self.get_token_count(candidate_chunk) <= self.max_tokens:
# Use chunk_text to get a properly bounded chunk
temp_chunks = self.chunk_text(candidate_chunk)
if temp_chunks:
first_chunk = temp_chunks[0]
remaining_text = text[len(first_chunk):]
break
first_chunk = candidate_chunk
remaining_text = text[len(first_chunk):].strip()
break
# If we found a good first chunk, use it
if first_chunk and self.get_token_count(first_chunk) <= self.max_tokens:
chunks = [first_chunk]
# Add remaining text as additional chunks if needed
if remaining_text.strip():
if remaining_text:
chunks.extend(self.chunk_text(remaining_text))
return chunks
# Fallback to regular chunking
return self.chunk_text(text)
def chunk_prompt_for_clip(prompt: str, max_tokens: int = 25) -> List[str]:
def chunk_prompt_for_clip(prompt: str, max_tokens: int = 70) -> List[str]:
"""
Convenience function to chunk a prompt for CLIP processing.
Uses a conservative 25 token limit to be safe.
Uses a 70 token limit to be safe while allowing meaningful prompts.
Args:
prompt (str): The prompt to chunk
max_tokens (int): Maximum tokens per chunk (default: 25 for safety)
max_tokens (int): Maximum tokens per chunk (default: 70 for CLIP compatibility)
Returns:
List[str]: List of prompt chunks

View File

@@ -1,110 +1,83 @@
#!/usr/bin/env python3
"""
Test script for the CLIP text chunking functionality.
Test script to verify that the text chunker fixes the token sequence length issues.
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Add the lib directory to the path so we can import our modules
sys.path.append(os.path.join(os.path.dirname(__file__), 'lib'))
from lib.text_chunker import chunk_prompt_for_clip, CLIPTextChunker
from text_chunker import CLIPTextChunker, chunk_prompt_for_clip
def test_long_prompt_chunking():
"""Test that long prompts are properly chunked within CLIP token limits."""
def test_basic_chunking():
"""Test basic text chunking functionality."""
print("=== Testing Basic Text Chunking ===")
# Create a sample long prompt similar to what the app generates
test_prompt = "Ultra-realistic close-up headshot of a Medium Brown skinned male soccer player with a plain background looking at the camera with his whole head in shot. The player is twenty-five years old, from United Kingdom, with clean-shaven and Medium Length Brown curly hair. He is facing the camera with a confident expression, wearing a soccer jersey. The lighting is natural and soft, emphasizing facial features and skin texture"
chunker = CLIPTextChunker(max_tokens=60) # Using conservative limit
print(f"Original prompt length: {len(test_prompt)} characters")
print(f"Original prompt: {test_prompt}")
print("-" * 80)
# Test short text (should not be chunked)
short_text = "A simple prompt"
chunks = chunker.chunk_text(short_text)
print(f"Short text: '{short_text}' -> {len(chunks)} chunks")
assert len(chunks) == 1, f"Expected 1 chunk, got {len(chunks)}"
# Test the chunking
chunker = CLIPTextChunker(max_tokens=70)
chunks = chunk_prompt_for_clip(test_prompt)
# Test long text (should be chunked)
long_text = "This is a very long text that should definitely exceed the token limit when processed by CLIP. " * 10
chunks = chunker.chunk_text(long_text)
print(f"Long text -> {len(chunks)} chunks")
assert len(chunks) > 1, f"Expected multiple chunks, got {len(chunks)}"
# Verify each chunk is within token limit
for i, chunk in enumerate(chunks):
token_count = chunker.estimate_token_count(chunk)
print(f"Chunk {i+1}: {token_count} tokens (max: {chunker.max_tokens})")
assert token_count <= chunker.max_tokens, f"Chunk {i+1} exceeds token limit: {token_count} > {chunker.max_tokens}"
print("✓ Basic chunking test passed\n")
def test_prompt_chunking():
"""Test chunking with actual prompts similar to the app."""
print("=== Testing Prompt Chunking ===")
# Simulate a long prompt like the one from app_config.json
long_prompt = "Ultra-realistic close-up headshot of a Fair skinned male soccer player with a plain background looking at the camera with his whole head in shot. The player is twenty-five years old, from United Kingdom, with clean-shaven and curly hair. He is facing the camera with a confident expression, wearing a soccer jersey. The lighting is natural and soft, emphasizing facial features and skin texture"
chunks = chunk_prompt_for_clip(long_prompt)
print(f"Long prompt -> {len(chunks)} chunks")
print(f"Number of chunks: {len(chunks)}")
for i, chunk in enumerate(chunks):
print(f"Chunk {i+1}: {chunk[:100]}...")
token_count = chunker.get_token_count(chunk)
print(f"\nChunk {i+1}:")
print(f" Text: {chunk}")
print(f" Token count: {token_count}")
print(f" Character count: {len(chunk)}")
print("✓ Prompt chunking test passed\n")
if token_count > 77:
print(f" ❌ ERROR: Chunk {i+1} exceeds CLIP's 77 token limit!")
return False
elif token_count > 70:
print(f" ⚠️ WARNING: Chunk {i+1} is close to the 77 token limit")
else:
print(f" ✅ Chunk {i+1} is within safe limits")
def test_priority_chunking():
"""Test priority-based chunking."""
print("=== Testing Priority Chunking ===")
chunker = CLIPTextChunker(max_tokens=50) # Smaller limit for testing
text = "This is a long text with important information about soccer players and their characteristics. The most important part is that they are professional athletes."
essential_info = ["soccer players", "professional athletes", "important information"]
chunks = chunker.create_priority_chunks(text, essential_info)
print(f"Priority chunks -> {len(chunks)} chunks")
for i, chunk in enumerate(chunks):
print(f"Priority chunk {i+1}: {chunk}")
print("✓ Priority chunking test passed\n")
print("-" * 80)
print("✅ All chunks are within CLIP's token limits!")
return True
def test_edge_cases():
"""Test edge cases."""
print("=== Testing Edge Cases ===")
"""Test edge cases for the chunking functionality."""
chunker = CLIPTextChunker(max_tokens=60)
chunker = CLIPTextChunker(max_tokens=70)
# Test empty text
# Test empty string
chunks = chunker.chunk_text("")
assert len(chunks) == 0, "Empty text should return no chunks"
assert chunks == [], "Empty string should return empty list"
# Test text exactly at limit
exact_text = "A" * 60 # Text exactly at the character limit
chunks = chunker.chunk_text(exact_text)
# Should return the text as-is since it's exactly at the limit
assert len(chunks) == 1, f"Expected 1 chunk for text at limit, got {len(chunks)}"
assert chunks[0] == exact_text, "Text at limit should be returned unchanged"
# Test short string
short_text = "Hello world"
chunks = chunker.chunk_text(short_text)
assert len(chunks) == 1 and chunks[0] == short_text, "Short text should not be chunked"
# Test text that exceeds limit (with spaces so it can be split)
long_text = "This is a very long text that should definitely exceed the character limit when processed. " * 3 # Text that exceeds the limit
chunks = chunker.chunk_text(long_text)
assert len(chunks) > 1, f"Expected multiple chunks for long text, got {len(chunks)}"
# Test very long single word (edge case)
long_word = "a" * 200
chunks = chunker.chunk_text(long_word)
# Should handle this gracefully
for chunk in chunks:
assert chunker.estimate_token_count(chunk) <= chunker.max_tokens, f"Chunk exceeds limit: {len(chunk)} > {chunker.max_tokens}"
assert chunker.get_token_count(chunk) <= 70, "Long word chunks should respect token limit"
print("✓ Edge cases test passed\n")
print("✅ Edge case tests passed!")
return True
if __name__ == "__main__":
try:
test_basic_chunking()
test_prompt_chunking()
test_priority_chunking()
test_edge_cases()
print("Testing text chunker fixes...")
print("=" * 80)
print("🎉 All tests passed! Text chunking functionality is working correctly.")
success1 = test_long_prompt_chunking()
success2 = test_edge_cases()
except Exception as e:
print(f"❌ Test failed: {e}")
if success1 and success2:
print("\n🎉 All tests passed! The token sequence length issue should be fixed.")
sys.exit(0)
else:
print("\n❌ Some tests failed. The issue may not be fully resolved.")
sys.exit(1)