OpenCLIP supports multiple tokenization strategies to handle different text encoders and languages.
Quick Start
import open_clip
# Get tokenizer for model
tokenizer = open_clip.get_tokenizer('ViT-B-32')
# Tokenize text
texts = ["a photo of a cat", "a photo of a dog"]
tokens = tokenizer(texts)
print(tokens.shape) # [2, 77] - batch_size x context_length
get_tokenizer()
Automatically selects the appropriate tokenizer based on model configuration:
import open_clip
tokenizer = open_clip.get_tokenizer(
model_name='ViT-B-32',
context_length=77, # Override default
cache_dir='/path/to/cache' # For HuggingFace tokenizers
)
Model identifier. Supported identifier schemes:
- Built-in:
'ViT-B-32'
- HuggingFace:
'hf-hub:org/repo'
- Local:
'local-dir:/path/to/model'
Maximum sequence length. Defaults to the model's configured length (usually 77).
Cache directory for downloading HuggingFace tokenizers
Tokenizer Types
SimpleTokenizer
Default BPE tokenizer used by most CLIP models:
from open_clip.tokenizer import SimpleTokenizer
tokenizer = SimpleTokenizer(
context_length=77,
clean='lower' # Text cleaning: 'lower', 'whitespace', or 'canonicalize'
)
# Tokenize
tokens = tokenizer(["Hello world", "OpenCLIP"])
print(tokens.shape) # [2, 77]
# Decode
text = tokenizer.decode(tokens[0])
print(text)
Maximum sequence length (including special tokens)
Text preprocessing:
'lower': Lowercase + whitespace cleaning
'whitespace': Whitespace cleaning only
'canonicalize': Remove punctuation + lowercase
Token reduction strategy applied when the input exceeds the context length:
'simple': Random contiguous block
'random': Random tokens (preserve order)
'shuffle': Random tokens (shuffle)
'syntax': Priority based on POS tags
HFTokenizer
HuggingFace Transformers tokenizer wrapper for models using pretrained LMs:
from open_clip.tokenizer import HFTokenizer
# Use RoBERTa tokenizer
tokenizer = HFTokenizer(
'roberta-base',
context_length=77,
clean='whitespace',
cache_dir='/path/to/cache'
)
tokens = tokenizer(["Example text"])
HuggingFace tokenizer identifier (e.g., 'roberta-base', 'xlm-roberta-large')
Remove separator tokens from output
Language code for multilingual tokenizers (e.g., 'en', 'fr', 'de')
SigLipTokenizer
SentencePiece tokenizer for SigLIP models:
from open_clip.tokenizer import SigLipTokenizer
# Different variants
tokenizer = SigLipTokenizer(
'c4-en', # Options: 'c4-en', 'mc4', 'gemma'
context_length=64 # SigLIP uses 64 by default
)
tokens = tokenizer(["Sample text"])
Variants:
'c4-en': English only (vocab_size=32,000)
'mc4': Multilingual (vocab_size=250,000)
'gemma': SigLIP2 models (vocab_size=256,000)
Context Length
Default Context Lengths
from open_clip.tokenizer import DEFAULT_CONTEXT_LENGTH
print(DEFAULT_CONTEXT_LENGTH) # 77 for CLIP models
Different models use different context lengths:
- CLIP models: 77 tokens
- SigLIP models: 64 tokens
- CoCa models: 76 tokens (+ 1 for generation)
Handling Long Text
tokenizer = open_clip.get_tokenizer('ViT-B-32')
# Text longer than 77 tokens will be truncated
long_text = "This is a very long description that exceeds 77 tokens..." * 10
tokens = tokenizer([long_text])
print(tokens.shape) # [1, 77] - truncated to context_length
Custom Context Length
# Create model with custom context length
model = open_clip.create_model(
'ViT-B-32',
pretrained='laion2b_s34b_b79k',
force_context_length=128 # Increase from default 77
)
# Get matching tokenizer
tokenizer = open_clip.get_tokenizer('ViT-B-32', context_length=128)
Changing context length requires model weights trained with that length. For pretrained models, use the original context length.
Text Preprocessing
Cleaning Modes
from open_clip.tokenizer import SimpleTokenizer
# Lowercase + whitespace cleaning (default)
lower_tokenizer = SimpleTokenizer(clean='lower')
tokens = lower_tokenizer(["Hello World!"]) # -> "hello world!"
# Whitespace only
whitespace_tokenizer = SimpleTokenizer(clean='whitespace')
tokens = whitespace_tokenizer(["Hello World!"]) # -> "Hello World!"
# Canonicalize (remove punctuation + lowercase)
canon_tokenizer = SimpleTokenizer(clean='canonicalize')
tokens = canon_tokenizer(["Hello, World!"]) # -> "hello world"
Special Tokens
SimpleTokenizer uses special tokens:
<start_of_text> (token_id: 49406)
<end_of_text> (token_id: 49407)
tokenizer = SimpleTokenizer()
print(tokenizer.sot_token_id) # 49406
print(tokenizer.eot_token_id) # 49407
print(tokenizer.vocab_size) # 49408
# Manual encoding
tokens = tokenizer.encode("hello")
print(tokens) # [...] token IDs without special tokens
# Full tokenization (with special tokens)
tokens = tokenizer(["hello"])
print(tokens[0]) # [49406, ..., 49407, 0, 0, ...] # SOT, tokens, EOT, padding
Multilingual Tokenization
XLM-RoBERTa Models
import open_clip
# Load multilingual model
model, _, preprocess = open_clip.create_model_and_transforms(
'xlm-roberta-base-ViT-B-32',
pretrained='laion5b_s13b_b90k'
)
tokenizer = open_clip.get_tokenizer('xlm-roberta-base-ViT-B-32')
# Supports 100+ languages
texts = [
"a photo of a cat", # English
"une photo d'un chat", # French
"ein Foto von einer Katze", # German
"猫の写真" # Japanese
]
tokens = tokenizer(texts)
SigLIP Multilingual
import open_clip
model, _, preprocess = open_clip.create_model_and_transforms(
'ViT-B-16-SigLIP-i18n-256',
pretrained='webli'
)
tokenizer = open_clip.get_tokenizer('ViT-B-16-SigLIP-i18n-256')
# Multilingual SigLIP tokenizer
tokens = tokenizer(["text in various languages"])
Advanced Usage
Batch Tokenization
import open_clip
tokenizer = open_clip.get_tokenizer('ViT-B-32')
# Large batch
texts = [f"description {i}" for i in range(1000)]
# Process in batches for memory efficiency
batch_size = 256
all_tokens = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i+batch_size]
tokens = tokenizer(batch)
all_tokens.append(tokens)
import torch
all_tokens = torch.cat(all_tokens, dim=0)
print(all_tokens.shape) # [1000, 77]
Custom Vocabulary
from open_clip.tokenizer import SimpleTokenizer
# Add custom special tokens
tokenizer = SimpleTokenizer(
additional_special_tokens=['<custom_token>']
)
# Access custom token ID
custom_id = tokenizer.encoder['<custom_token>']
Decoding Tokens
tokenizer = open_clip.get_tokenizer('ViT-B-32')
# Tokenize
original = ["a photo of a cat"]
tokens = tokenizer(original)
# Decode
from open_clip import decode
decoded = decode(tokens[0])
print(decoded) # "<start_of_text>a photo of a cat<end_of_text>"
# Clean decoded text
cleaned = decoded.replace("<start_of_text>", "").replace("<end_of_text>", "").strip()
print(cleaned) # "a photo of a cat"
Complete Example
import torch
import open_clip
from PIL import Image
# Load model and tokenizer
model, _, preprocess = open_clip.create_model_and_transforms(
'ViT-L-14',
pretrained='datacomp_xl_s13b_b90k',
device='cuda'
)
model.eval()
tokenizer = open_clip.get_tokenizer('ViT-L-14')
# Prepare inputs
image = preprocess(Image.open('cat.jpg')).unsqueeze(0).cuda()
# Create text descriptions with templates
labels = ['cat', 'dog', 'bird']
texts = [f"a photo of a {label}" for label in labels]
tokens = tokenizer(texts).cuda()
print("Token shape:", tokens.shape) # [3, 77]
print("Context length:", tokenizer.context_length) # 77
# Inference
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(tokens)
# Normalize
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
# Compute similarity
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
predicted_idx = similarity.argmax().item()
print(f"\nPredicted: {labels[predicted_idx]}")
print(f"Confidence: {similarity[0, predicted_idx]:.2%}")
print(f"\nAll probabilities:")
for label, prob in zip(labels, similarity[0]):
print(f" {label}: {prob:.2%}")
The tokenizer is automatically selected based on model configuration. For most CLIP models, this will be SimpleTokenizer. Models using HuggingFace text encoders will use HFTokenizer.