mirror of
https://github.com/karl0ss/comfy_fm24_newgens.git
synced 2025-10-25 12:43:59 +01:00
138 lines
4.7 KiB
Python
|
|
import re
|
||
|
|
from typing import List, Optional
|
||
|
|
|
||
|
|
class CLIPTextChunker:
    """
    Utility class for chunking text to fit within CLIP's token limits.

    CLIP models typically have a maximum sequence length of 77 tokens.
    A conservative default limit of 60 is used to leave headroom for the
    special tokens CLIP adds around the prompt.

    NOTE: "tokens" are approximated by *character count* throughout this
    class (see ``estimate_token_count``), so ``max_tokens`` is really a
    character budget, not a true tokenizer count.
    """

    def __init__(self, max_tokens: int = 60):
        """
        Initialize the text chunker.

        Args:
            max_tokens (int): Maximum number of estimated tokens
                (characters — see class note) per chunk. Default is 60,
                a conservative value for CLIP's 77-token limit.
        """
        self.max_tokens = max_tokens

    def estimate_token_count(self, text: str) -> int:
        """
        Estimate the number of tokens in a text string.

        Uses character count as a simple, deterministic proxy for the
        real CLIP tokenizer count.

        Args:
            text (str): Input text

        Returns:
            int: Estimated token count (the length of ``text`` in
                characters)
        """
        # Simple approach: character count is a reliable, dependency-free
        # proxy; trying to mimic the real tokenizer here would be fragile.
        return len(text)

    def chunk_text(self, text: str, preserve_sentences: bool = True) -> List[str]:
        """
        Chunk text into pieces that fit within the token (character) limit.

        Splits on whitespace and greedily packs words into chunks so that
        each chunk's estimated token count stays within ``self.max_tokens``.
        A single word longer than the limit still becomes its own
        (oversized) chunk.

        Args:
            text (str): Input text to chunk
            preserve_sentences (bool): Accepted for API compatibility but
                currently ignored — chunking is word-based only.
                TODO: implement sentence-boundary preservation.

        Returns:
            List[str]: List of text chunks ([] for empty/whitespace input)
        """
        if not text.strip():
            return []

        # Fast path: the whole text already fits in one chunk.
        if self.estimate_token_count(text) <= self.max_tokens:
            return [text]

        chunks: List[str] = []
        current_chunk: List[str] = []
        current_length = 0  # running cost incl. one separator per word

        for word in text.split():
            # +1 accounts for the separating space added when joining.
            word_cost = len(word) + 1

            if current_length + word_cost > self.max_tokens and current_chunk:
                # Adding this word would overflow the budget: flush the
                # current chunk and start a new one with this word.
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_length = word_cost
            else:
                current_chunk.append(word)
                current_length += word_cost

        # Flush any words left in the final, partially filled chunk.
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def create_priority_chunks(self, text: str, essential_info: List[str]) -> List[str]:
        """
        Create chunks with priority given to essential information.

        For each essential phrase found in ``text``, a chunk is built from
        the phrase plus up to 50 characters of context on either side. If
        several essential chunks result, they are merged into one chunk
        (re-chunked if the merge exceeds the limit).

        Args:
            text (str): Full text to chunk
            essential_info (List[str]): List of essential phrases that should be preserved

        Returns:
            List[str]: Prioritized chunks; falls back to plain
                ``chunk_text(text)`` when no essential phrase matches
        """
        essential_chunks: List[str] = []

        for info in essential_info:
            if info not in text:
                continue

            # Window of up to 50 characters of context on each side of
            # the first occurrence of the essential phrase.
            info_index = text.find(info)
            start = max(0, info_index - 50)
            end = min(len(text), info_index + len(info) + 50)
            context = text[start:end]

            # Take the first (most relevant) chunk of the context.
            # Guard the empty case so a degenerate window cannot raise
            # IndexError (the original indexed [0] unconditionally).
            context_chunks = self.chunk_text(context)
            if not context_chunks:
                continue
            chunk = context_chunks[0]
            if chunk not in essential_chunks:
                essential_chunks.append(chunk)

        # If we collected several essential chunks, try to merge them into
        # a single prompt; re-chunk the merge if it is too long.
        if len(essential_chunks) > 1:
            combined = " ".join(essential_chunks)
            if self.estimate_token_count(combined) <= self.max_tokens:
                return [combined]
            # Combined chunk is too long — split it back down.
            return self.chunk_text(combined)

        return essential_chunks if essential_chunks else self.chunk_text(text)
def chunk_prompt_for_clip(
    prompt: str,
    max_tokens: int = 60,
    essential_info: Optional[List[str]] = None,
) -> List[str]:
    """
    Convenience function to chunk a prompt for CLIP processing.

    Uses a conservative 60 token limit by default to be safe.

    Args:
        prompt (str): The prompt to chunk
        max_tokens (int): Maximum tokens per chunk (default: 60 for safety)
        essential_info (Optional[List[str]]): Phrases to preserve with
            priority. When None (the default), the stock headshot phrases
            used by the newgen generator are applied — this keeps the
            previous hard-coded behavior for existing callers.

    Returns:
        List[str]: List of prompt chunks
    """
    chunker = CLIPTextChunker(max_tokens=max_tokens)

    if essential_info is None:
        # Default essential information for the face-generation prompts.
        essential_info = [
            "Ultra-realistic close-up headshot",
            "male soccer player",
            "looking at the camera",
            "facing the camera",
            "confident expression",
            "soccer jersey",
        ]

    return chunker.create_priority_chunks(prompt, essential_info)