2025-09-23 15:30:50 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								#!/usr/bin/env python3  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								""" 
  
						 
					
						
							
								
									
										
										
										
											2025-09-23 16:00:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								Test script to verify that the text chunker fixes the token sequence length issues.
						 
					
						
							
								
									
										
										
										
											2025-09-23 15:30:50 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								""" 
  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  sys  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  os  
						 
					
						
							
								
									
										
										
										
											2025-09-23 16:00:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								sys . path . append ( os . path . dirname ( os . path . abspath ( __file__ ) ) )  
						 
					
						
							
								
									
										
										
										
											2025-09-23 15:30:50 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-09-23 16:00:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								from  lib . text_chunker  import  chunk_prompt_for_clip ,  CLIPTextChunker  
						 
					
						
							
								
									
										
										
										
											2025-09-23 15:30:50 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-09-23 16:00:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def test_long_prompt_chunking():
								    """Check that a long prompt is split into chunks that fit CLIP's token limit.

								    Builds a sample prompt, splits it with ``chunk_prompt_for_clip`` and prints
								    a per-chunk report (text, token count, character count). Token counts are
								    taken from ``CLIPTextChunker.get_token_count``.

								    Returns:
								        bool: ``True`` when at least one chunk was produced and no chunk has
								        more than 77 tokens. ``False`` as soon as a chunk exceeds 77 tokens,
								        or when the chunker produced no chunks at all.
								    """
								    # Hard ceiling checked per chunk, and the softer budget used for warnings.
								    clip_token_limit = 77
								    soft_token_limit = 60

								    # Create a sample long prompt similar to what the app generates
								    test_prompt = "Ultra-realistic close-up headshot of a Medium Brown skinned male soccer player with a plain background looking at the camera with his whole head in shot. The player is twenty-five years old, from United Kingdom, with clean-shaven and Medium Length Brown curly hair. He is facing the camera with a confident expression, wearing a soccer jersey. The lighting is natural and soft, emphasizing facial features and skin texture"

								    print(f"Original prompt length: {len(test_prompt)} characters")
								    print(f"Original prompt: {test_prompt}")
								    print("-" * 80)

								    # Test the chunking. The chunker instance is only used to count tokens;
								    # the split itself goes through chunk_prompt_for_clip.
								    chunker = CLIPTextChunker(max_tokens=soft_token_limit)
								    chunks = chunk_prompt_for_clip(test_prompt)

								    print(f"Number of chunks: {len(chunks)}")

								    # A non-empty prompt must yield at least one chunk; without this check the
								    # loop below is skipped and the test would report success vacuously.
								    if len(chunks) == 0:
								        print("  ❌ ERROR: Chunker returned no chunks for a non-empty prompt!")
								        return False

								    for i, chunk in enumerate(chunks):
								        token_count = chunker.get_token_count(chunk)
								        print(f"\nChunk {i + 1}:")
								        print(f"  Text: {chunk}")
								        print(f"  Token count: {token_count}")
								        print(f"  Character count: {len(chunk)}")

								        if token_count > clip_token_limit:
								            # Over the hard limit: stop at the first offending chunk.
								            print(f"  ❌ ERROR: Chunk {i + 1} exceeds CLIP's {clip_token_limit} token limit!")
								            return False
								        elif token_count > soft_token_limit:
								            # Above the soft budget but still within the hard limit.
								            print(f"  ⚠️  WARNING: Chunk {i + 1} is close to the {soft_token_limit} token limit")
								        else:
								            print(f"  ✅ Chunk {i + 1} is within safe limits")

								    print("-" * 80)
								    print("✅ All chunks are within CLIP's token limits!")
								    return True
							 
						 
					
						
							
								
									
										
										
										
											2025-09-23 15:30:50 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								def test_edge_cases():
								    """Run ``CLIPTextChunker.chunk_text`` against empty, short and one-word inputs.

								    Returns:
								        bool: ``True`` once every assertion has held.

								    Raises:
								        AssertionError: if any of the three expectations below is violated.
								    """
								    chunker = CLIPTextChunker(max_tokens=60)

								    # Empty input: expect an empty list back.
								    empty_result = chunker.chunk_text("")
								    assert empty_result == [], "Empty string should return empty list"

								    # Short input: expect exactly one chunk, equal to the input itself.
								    short_text = "Hello world"
								    short_result = chunker.chunk_text(short_text)
								    assert len(short_result) == 1 and short_result[0] == short_text, "Short text should not be chunked"

								    # A single 200-character word: whatever chunks come back, each one
								    # must stay within the 60-token budget.
								    long_word = "a" * 200
								    long_word_result = chunker.chunk_text(long_word)
								    assert all(
								        chunker.get_token_count(piece) <= 60 for piece in long_word_result
								    ), "Long word chunks should respect token limit"

								    print("✅ Edge case tests passed!")
								    return True
							 
						 
					
						
							
								
									
										
										
										
											2025-09-23 15:30:50 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								if __name__ == "__main__":
								    # Script entry point: run both checks, then report via message and exit status.
								    print("Testing text chunker fixes...")
								    print("=" * 80)

								    # Both checks always run, in this order, before the verdict is computed.
								    outcomes = (test_long_prompt_chunking(), test_edge_cases())

								    if not all(outcomes):
								        print("\n❌ Some tests failed. The issue may not be fully resolved.")
								        sys.exit(1)

								    print("\n🎉 All tests passed! The token sequence length issue should be fixed.")
								    sys.exit(0)