mirror of https://github.com/karl0ss/comfy_fm24_newgens.git (synced 2025-10-03 06:40:06 +01:00)
text chunking
This commit is contained in:
parent fd999ec1e6
commit 650369b06f
@@ -8,12 +8,12 @@ class CLIPTextChunker:
     Using a conservative limit of 70 tokens to account for special tokens.
     """
 
-    def __init__(self, max_tokens: int = 70):
+    def __init__(self, max_tokens: int = 40):
         """
         Initialize the text chunker.
 
         Args:
-            max_tokens (int): Maximum number of tokens per chunk (default: 70 for CLIP, being conservative)
+            max_tokens (int): Maximum number of tokens per chunk (default: 40 for CLIP, being very conservative)
         """
         self.max_tokens = max_tokens
         self._tokenizer = None
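Not part of the commit: a minimal sketch of how the tighter default plays out when the chunker is constructed. The class name comes from the diff above; the import path is an assumption.

# Sketch only: the module path "clip_text_chunker" is assumed, not shown in this diff.
from clip_text_chunker import CLIPTextChunker

chunker = CLIPTextChunker()                      # now defaults to max_tokens=40
legacy_chunker = CLIPTextChunker(max_tokens=70)  # the previous default can still be requested explicitly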
@@ -24,8 +24,9 @@ class CLIPTextChunker:
         if self._tokenizer is None:
             try:
                 from transformers import CLIPTokenizer
-                self._tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
-            except ImportError:
+                # Use a simpler model that should be more reliable
+                self._tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", local_files_only=False)
+            except Exception as e:
                 # Fallback to character-based estimation if transformers not available
                 self._tokenizer = None
         return self._tokenizer
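For reference, a standalone sketch of the lazy-loading pattern used above: swallow any loading failure and hand back None so the caller can fall back to character-based estimation. Only CLIPTokenizer.from_pretrained and its local_files_only flag are taken from the diff; the helper name is illustrative.

from typing import Optional

def load_clip_tokenizer() -> Optional[object]:
    """Illustrative helper, not part of the commit."""
    try:
        from transformers import CLIPTokenizer
        # local_files_only=False permits a download when no cached copy exists
        return CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", local_files_only=False)
    except Exception:
        # transformers missing or download failed: caller estimates tokens from characters
        return None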
@@ -42,7 +43,8 @@ class CLIPTextChunker:
         """
         if self.tokenizer is None:
             # Fallback to character count if tokenizer not available
-            return len(text)
+            # Use a very conservative estimate: ~0.6 characters per token for CLIP
+            return int(len(text) * 0.6)
 
         tokens = self.tokenizer(
             text,
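Aside (not from the commit): the fallback above estimates roughly 0.6 tokens per character, so a prompt has to exceed about 66 characters before the estimate crosses the new 40-token limit. A tiny worked example, with an illustrative prompt string:

def estimate_tokens(text: str) -> int:
    # Same heuristic as the fallback branch: tokens ~= 0.6 * characters
    return int(len(text) * 0.6)

sample = "a football manager regen portrait, detailed face"  # 48 characters
print(estimate_tokens(sample))      # 28, comfortably under the 40-token limit
print(estimate_tokens(sample * 2))  # 57, would trigger chunking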
@@ -153,14 +155,14 @@ class CLIPTextChunker:
         # Fallback to regular chunking
         return self.chunk_text(text)
 
-def chunk_prompt_for_clip(prompt: str, max_tokens: int = 70) -> List[str]:
+def chunk_prompt_for_clip(prompt: str, max_tokens: int = 40) -> List[str]:
     """
     Convenience function to chunk a prompt for CLIP processing.
-    Uses a conservative 70 token limit to be safe.
+    Uses a conservative 40 token limit to be safe.
 
     Args:
         prompt (str): The prompt to chunk
-        max_tokens (int): Maximum tokens per chunk (default: 70 for safety)
+        max_tokens (int): Maximum tokens per chunk (default: 40 for safety)
 
     Returns:
         List[str]: List of prompt chunks
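Hypothetical usage of the convenience function after this change; only the signature is taken from the diff, while the import path and the prompt text are assumptions.

from clip_text_chunker import chunk_prompt_for_clip  # module path assumed

prompt = ("portrait photo of a 16 year old football newgen, studio lighting, "
          "sharp focus, neutral background, club training kit")
chunks = chunk_prompt_for_clip(prompt)                       # new default: max_tokens=40
tight_chunks = chunk_prompt_for_clip(prompt, max_tokens=30)  # stricter budget if needed
for chunk in chunks:
    print(len(chunk), chunk)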