#!/usr/bin/env python3
"""
Test script for the CLIP text chunking functionality.

Exercises ``CLIPTextChunker`` and ``chunk_prompt_for_clip`` from the
``text_chunker`` module found in the ``lib`` directory next to this file.
When run as a script, every test function is executed in order; the process
exits with status 1 as soon as one of them raises.
"""

import sys
import os

# Add the lib directory to the path so we can import our modules
sys.path.append(os.path.join(os.path.dirname(__file__), 'lib'))

from text_chunker import CLIPTextChunker, chunk_prompt_for_clip


def test_basic_chunking():
    """Test basic text chunking functionality.

    Checks that a short text yields exactly one chunk, that a long text
    yields more than one, and that every chunk's estimated token count is
    within ``chunker.max_tokens``.
    """
    print("=== Testing Basic Text Chunking ===")

    chunker = CLIPTextChunker(max_tokens=60)  # Using conservative limit

    # Test short text (should not be chunked)
    short_text = "A simple prompt"
    chunks = chunker.chunk_text(short_text)
    print(f"Short text: '{short_text}' -> {len(chunks)} chunks")
    assert len(chunks) == 1, f"Expected 1 chunk, got {len(chunks)}"

    # Test long text (should be chunked)
    long_text = "This is a very long text that should definitely exceed the token limit when processed by CLIP. " * 10
    chunks = chunker.chunk_text(long_text)
    print(f"Long text -> {len(chunks)} chunks")
    assert len(chunks) > 1, f"Expected multiple chunks, got {len(chunks)}"

    # Verify each chunk is within token limit
    for i, chunk in enumerate(chunks):
        token_count = chunker.estimate_token_count(chunk)
        print(f"Chunk {i+1}: {token_count} tokens (max: {chunker.max_tokens})")
        assert token_count <= chunker.max_tokens, (
            f"Chunk {i+1} exceeds token limit: {token_count} > {chunker.max_tokens}"
        )

    print("✓ Basic chunking test passed\n")


def test_prompt_chunking():
    """Test chunking with actual prompts similar to the app.

    Smoke test only: it prints the number of chunks and the first 100
    characters of each one, and makes no assertions about the result.
    """
    print("=== Testing Prompt Chunking ===")

    # Simulate a long prompt like the one from app_config.json
    long_prompt = (
        "Ultra-realistic close-up headshot of a Fair skinned male soccer player "
        "with a plain background looking at the camera with his whole head in shot. "
        "The player is twenty-five years old, from United Kingdom, "
        "with clean-shaven and curly hair. "
        "He is facing the camera with a confident expression, wearing a soccer jersey. "
        "The lighting is natural and soft, emphasizing facial features and skin texture"
    )

    chunks = chunk_prompt_for_clip(long_prompt)
    print(f"Long prompt -> {len(chunks)} chunks")

    for i, chunk in enumerate(chunks):
        print(f"Chunk {i+1}: {chunk[:100]}...")

    print("✓ Prompt chunking test passed\n")


def test_priority_chunking():
    """Test priority-based chunking.

    Smoke test only: calls ``create_priority_chunks`` with a text and a list
    of essential phrases, then prints each returned chunk without asserting
    anything about its content.
    """
    print("=== Testing Priority Chunking ===")

    chunker = CLIPTextChunker(max_tokens=50)  # Smaller limit for testing

    text = (
        "This is a long text with important information about soccer players "
        "and their characteristics. "
        "The most important part is that they are professional athletes."
    )
    essential_info = ["soccer players", "professional athletes", "important information"]

    chunks = chunker.create_priority_chunks(text, essential_info)
    print(f"Priority chunks -> {len(chunks)} chunks")

    for i, chunk in enumerate(chunks):
        print(f"Priority chunk {i+1}: {chunk}")

    print("✓ Priority chunking test passed\n")


def test_edge_cases():
    """Test edge cases.

    Covers empty input (no chunks expected), a 60-character unbroken string
    (expected back as one unchanged chunk), and a long spaced text (expected
    to be split into chunks that each stay within ``chunker.max_tokens``).
    """
    print("=== Testing Edge Cases ===")

    chunker = CLIPTextChunker(max_tokens=60)

    # Test empty text
    chunks = chunker.chunk_text("")
    assert len(chunks) == 0, "Empty text should return no chunks"

    # Test a 60-character string with no spaces against max_tokens=60.
    # NOTE(review): this was originally described as "exactly at the
    # character limit", but max_tokens is a token budget; how characters map
    # to tokens depends on estimate_token_count -- confirm the intent.
    exact_text = "A" * 60
    chunks = chunker.chunk_text(exact_text)
    # The chunker is expected to return the text as-is, in a single chunk
    assert len(chunks) == 1, f"Expected 1 chunk for text at limit, got {len(chunks)}"
    assert chunks[0] == exact_text, "Text at limit should be returned unchanged"

    # Test text that exceeds limit (with spaces so it can be split)
    long_text = "This is a very long text that should definitely exceed the character limit when processed. " * 3
    chunks = chunker.chunk_text(long_text)
    assert len(chunks) > 1, f"Expected multiple chunks for long text, got {len(chunks)}"
    for chunk in chunks:
        # Report the same quantity that is compared (estimated tokens), not
        # the character length of the chunk.
        token_count = chunker.estimate_token_count(chunk)
        assert token_count <= chunker.max_tokens, (
            f"Chunk exceeds limit: {token_count} > {chunker.max_tokens}"
        )

    print("✓ Edge cases test passed\n")


if __name__ == "__main__":
    # Top-level boundary: any failure (including AssertionError) is reported
    # and turned into a non-zero exit status.
    try:
        test_basic_chunking()
        test_prompt_chunking()
        test_priority_chunking()
        test_edge_cases()
        print("🎉 All tests passed! Text chunking functionality is working correctly.")
    except Exception as e:
        print(f"❌ Test failed: {e}")
        sys.exit(1)