#!/usr/bin/env python3
"""
Test script to verify that the text chunker fixes the token sequence length issues.
"""

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from lib.text_chunker import chunk_prompt_for_clip, CLIPTextChunker


def test_long_prompt_chunking():
    """Test that long prompts are properly chunked within CLIP token limits."""
    # Create a sample long prompt similar to what the app generates.
    test_prompt = (
        "Ultra-realistic close-up headshot of a Medium Brown skinned male soccer "
        "player with a plain background looking at the camera with his whole head "
        "in shot. The player is twenty-five years old, from United Kingdom, with "
        "clean-shaven and Medium Length Brown curly hair. He is facing the camera "
        "with a confident expression, wearing a soccer jersey. The lighting is "
        "natural and soft, emphasizing facial features and skin texture"
    )

    print(f"Original prompt length: {len(test_prompt)} characters")
    print(f"Original prompt: {test_prompt}")
    print("-" * 80)

    # Chunk the prompt, then verify every chunk against the token limits.
    chunker = CLIPTextChunker(max_tokens=60)
    chunks = chunk_prompt_for_clip(test_prompt)

    print(f"Number of chunks: {len(chunks)}")

    for i, chunk in enumerate(chunks):
        token_count = chunker.get_token_count(chunk)
        print(f"\nChunk {i + 1}:")
        print(f"  Text: {chunk}")
        print(f"  Token count: {token_count}")
        print(f"  Character count: {len(chunk)}")

        if token_count > 77:
            print(f"  ❌ ERROR: Chunk {i + 1} exceeds CLIP's 77-token limit!")
            return False
        elif token_count > 60:
            print(f"  ⚠️ WARNING: Chunk {i + 1} exceeds the 60-token safety margin")
        else:
            print(f"  ✅ Chunk {i + 1} is within safe limits")

    print("-" * 80)
    print("✅ All chunks are within CLIP's token limits!")
    return True


def test_edge_cases():
    """Test edge cases for the chunking functionality."""
    chunker = CLIPTextChunker(max_tokens=60)

    # An empty string should produce no chunks at all.
    chunks = chunker.chunk_text("")
    assert chunks == [], "Empty string should return empty list"

    # A short string should come back as a single, unmodified chunk.
    short_text = "Hello world"
    chunks = chunker.chunk_text(short_text)
    assert len(chunks) == 1 and chunks[0] == short_text, "Short text should not be chunked"

    # A very long single word has no whitespace to split on; the chunker
    # should still handle it gracefully and keep every chunk within the limit.
    long_word = "a" * 200
    chunks = chunker.chunk_text(long_word)
    for chunk in chunks:
        assert chunker.get_token_count(chunk) <= 60, "Long word chunks should respect token limit"

    print("✅ Edge case tests passed!")
    return True


if __name__ == "__main__":
    print("Testing text chunker fixes...")
    print("=" * 80)

    success1 = test_long_prompt_chunking()
    success2 = test_edge_cases()

    if success1 and success2:
        print("\n🎉 All tests passed! The token sequence length issue should be fixed.")
        sys.exit(0)
    else:
        print("\n❌ Some tests failed. The issue may not be fully resolved.")
        sys.exit(1)
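

# --- Supplementary sketch (an assumption, not part of the original suite) ---
# The edge-case test above already asserts that short text comes back as a
# single, unmodified chunk. The check below extends that idea into a
# word-preservation test: re-joining the chunks of a multi-sentence prompt
# should reproduce the original word sequence. Whether CLIPTextChunker
# guarantees order-preserving, lossless chunking is an assumption here, so
# treat this as an optional extra check rather than part of the verified fix.
# It is defined after the __main__ block and is not wired into the exit-code
# logic above, but a test runner such as pytest would collect it on import.
def test_chunks_preserve_words():
    """Hypothetical extra test: chunking should not drop or reorder words."""
    chunker = CLIPTextChunker(max_tokens=60)
    text = (
        "A portrait prompt with several sentences. It describes lighting, "
        "wardrobe, and framing in enough detail to exercise the chunker."
    )
    # Join the chunks back together and compare word sequences; this holds
    # whether the text fits in one chunk or is split across several, as long
    # as the chunker does not modify the text inside each chunk.
    rejoined = " ".join(chunker.chunk_text(text)).split()
    assert rejoined == text.split(), "Chunking should preserve the word sequence"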