2025-09-23 15:30:50 +01:00
#!/usr/bin/env python3
"""
Test script to verify that the text chunker fixes the token sequence length issues.
"""
import sys
import os
2025-09-23 16:00:27 +01:00
# Append this script's own directory to sys.path; presumably this is what lets
# the `lib` package be imported when the script is run directly (TODO confirm
# that `lib/` lives next to this file).
sys . path . append ( os . path . dirname ( os . path . abspath ( __file__ ) ) )
2025-09-23 15:30:50 +01:00
2025-09-23 16:00:27 +01:00
from lib . text_chunker import chunk_prompt_for_clip , CLIPTextChunker
2025-09-23 15:30:50 +01:00
2025-09-23 16:00:27 +01:00
def test_long_prompt_chunking():
    """Test that long prompts are properly chunked within CLIP token limits.

    Splits a sample prompt with ``chunk_prompt_for_clip`` and measures every
    resulting chunk with ``CLIPTextChunker.get_token_count``, printing a
    per-chunk report along the way.

    Returns:
        bool: ``False`` as soon as a chunk exceeds 77 tokens, ``True`` when
        every chunk is at or below that limit.
    """
    # Limits are named once so the comparisons and the messages cannot drift.
    hard_limit = 77  # a chunk above this count fails the test
    soft_limit = 60  # a chunk above this count only triggers a warning

    # Create a sample long prompt similar to what the app generates
    test_prompt = "Ultra-realistic close-up headshot of a Medium Brown skinned male soccer player with a plain background looking at the camera with his whole head in shot. The player is twenty-five years old, from United Kingdom, with clean-shaven and Medium Length Brown curly hair. He is facing the camera with a confident expression, wearing a soccer jersey. The lighting is natural and soft, emphasizing facial features and skin texture"

    print(f"Original prompt length: {len(test_prompt)} characters")
    print(f"Original prompt: {test_prompt}")
    print("-" * 80)

    # Test the chunking.
    # NOTE(review): the chunker instance is only used for counting tokens; the
    # split itself uses chunk_prompt_for_clip with its default arguments, so
    # the 60-token setting may not apply to the split — confirm.
    chunker = CLIPTextChunker(max_tokens=soft_limit)
    chunks = chunk_prompt_for_clip(test_prompt)

    print(f"Number of chunks: {len(chunks)}")
    for i, chunk in enumerate(chunks):
        token_count = chunker.get_token_count(chunk)
        print(f"\nChunk {i + 1}:")
        print(f"Text: {chunk}")
        print(f"Token count: {token_count}")
        print(f"Character count: {len(chunk)}")
        if token_count > hard_limit:
            print(f"❌ ERROR: Chunk {i + 1} exceeds CLIP's {hard_limit} token limit!")
            return False
        elif token_count > soft_limit:
            print(f"⚠️ WARNING: Chunk {i + 1} is close to the {soft_limit} token limit")
        else:
            print(f"✅ Chunk {i + 1} is within safe limits")
        print("-" * 80)

    print("✅ All chunks are within CLIP's token limits!")
    return True
2025-09-23 15:30:50 +01:00
def test_edge_cases():
    """Test edge cases for the chunking functionality.

    Exercises ``CLIPTextChunker.chunk_text`` with an empty string, a short
    string, and a single 200-character word.

    Returns:
        bool: ``True`` when every check passes.

    Raises:
        AssertionError: if any of the edge-case expectations does not hold.
    """
    max_tokens = 60  # named once: used for the chunker and for the final check
    chunker = CLIPTextChunker(max_tokens=max_tokens)

    # Test empty string
    chunks = chunker.chunk_text("")
    assert chunks == [], "Empty string should return empty list"

    # Test short string
    short_text = "Hello world"
    chunks = chunker.chunk_text(short_text)
    assert len(chunks) == 1 and chunks[0] == short_text, "Short text should not be chunked"

    # Test very long single word (edge case)
    long_word = "a" * 200
    chunks = chunker.chunk_text(long_word)
    # Should handle this gracefully: whatever chunks come back must fit the limit
    for chunk in chunks:
        assert chunker.get_token_count(chunk) <= max_tokens, "Long word chunks should respect token limit"

    print("✅ Edge case tests passed!")
    return True
2025-09-23 15:30:50 +01:00
# Script entry point: run both test functions and report the combined result
# through the process exit code (0 = all passed, 1 = at least one failed).
if __name__ == "__main__":
    print("Testing text chunker fixes...")
    print("=" * 80)

    # Both tests are always run; the results are only combined afterwards.
    success1 = test_long_prompt_chunking()
    success2 = test_edge_cases()

    if success1 and success2:
        print("\n🎉 All tests passed! The token sequence length issue should be fixed.")
        sys.exit(0)
    else:
        print("\n❌ Some tests failed. The issue may not be fully resolved.")
        sys.exit(1)