Skip to main content
Creates and configures a contrastive vision-language model with flexible weight loading options.

Signature

def create_model(
    model_name: str,
    pretrained: Optional[str] = None,
    load_weights: bool = True,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_patch_dropout: Optional[float] = None,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    force_preprocess_cfg: Optional[Dict[str, Any]] = None,
    force_context_length: Optional[int] = None,
    pretrained_image: bool = False,
    pretrained_text: bool = True,
    pretrained_image_path: Optional[str] = None,
    pretrained_text_path: Optional[str] = None,
    cache_dir: Optional[str] = None,
    output_dict: Optional[bool] = None,
    require_pretrained: bool = False,
    weights_only: bool = True,
    **model_kwargs,
) -> torch.nn.Module:
    ...

Parameters

model_name
str
required
Model identifier, optionally prefixed with a source scheme:
  • 'ViT-B-32': Built-in model name. pretrained specifies CLIP weights source (tag or file path).
  • 'hf-hub:org/repo': Loads config/weights from HuggingFace Hub. pretrained is IGNORED.
  • 'local-dir:/path/to/folder': Loads config/weights from local directory. pretrained is IGNORED.
pretrained
Optional[str]
default:"None"
Source for CLIP weights (tag or file path), used ONLY if model_name has no scheme prefix. Can be a pretrained tag like 'openai', 'laion400m_e32', or a path to a checkpoint file.
load_weights
bool
default:"True"
If True, load the resolved pretrained weights into the model. If False, the model is left randomly initialized (aside from any per-tower weight overrides).
precision
str
default:"'fp32'"
Model precision. Options: 'fp32', 'fp16', 'bf16', 'pure_fp16', 'pure_bf16'.
device
Union[str, torch.device]
default:"'cpu'"
Device to load model on. Can be 'cpu', 'cuda', or a torch.device object.
jit
bool
default:"False"
If True, JIT compile the model using torch.jit.script.
force_quick_gelu
bool
default:"False"
Force use of QuickGELU activation in model config instead of standard GELU.
force_custom_text
bool
default:"False"
Force use of custom text encoder architecture (CustomTextCLIP).
force_patch_dropout
Optional[float]
default:"None"
Override patch dropout value in model config. Values typically range from 0.0 to 1.0.
force_image_size
Optional[Union[int, Tuple[int, int]]]
default:"None"
Override image size in model config. Can be a single int (square) or tuple (height, width).
force_preprocess_cfg
Optional[Dict[str, Any]]
default:"None"
Dictionary to override specific preprocessing parameters (mean, std, interpolation, resize_mode).
force_context_length
Optional[int]
default:"None"
Override context length (max sequence length) in text config.
pretrained_image
bool
default:"False"
Load default base weights for image tower at creation if no CLIP weights loaded. Only effective for timm-based vision models.
pretrained_text
bool
default:"True"
Load default base weights for text tower at creation if no CLIP weights loaded. Only effective for HuggingFace-based text models.
pretrained_image_path
Optional[str]
default:"None"
Path to a checkpoint whose weights are loaded specifically into the image tower after model creation. Applied after any full CLIP checkpoint, so these weights take precedence for the image tower.
pretrained_text_path
Optional[str]
default:"None"
Path to a checkpoint whose weights are loaded specifically into the text tower after model creation. Applied after any full CLIP checkpoint, so these weights take precedence for the text tower.
cache_dir
Optional[str]
default:"None"
Cache directory for downloaded weights. Defaults to ~/.cache/clip.
output_dict
Optional[bool]
default:"None"
If True and the model supports it, the forward pass returns a dictionary of named outputs instead of a tuple of tensors.
require_pretrained
bool
default:"False"
If True, raise an error when no pretrained CLIP weights were successfully loaded.
weights_only
bool
default:"True"
If True, pass weights_only=True to torch.load when loading checkpoints. This restricts unpickling to tensor data, preventing arbitrary code execution from untrusted checkpoint files.
**model_kwargs
Any
Additional keyword arguments for model constructor (highest override priority).

Returns

model
torch.nn.Module
The created model instance (CLIP, CustomTextCLIP, or CoCa depending on configuration).

Example

import open_clip

# Create model with OpenAI pretrained weights
model = open_clip.create_model('ViT-B-32', pretrained='openai')

# Create model from HuggingFace Hub
model = open_clip.create_model('hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K')

# Create model with custom image size
model = open_clip.create_model(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k',
    force_image_size=336,
    device='cuda',
    precision='fp16'
)

# Create model from local directory
model = open_clip.create_model('local-dir:/path/to/model')

# Create model with separate tower weights
model = open_clip.create_model(
    'ViT-B-32',
    pretrained_image_path='/path/to/image_weights.pt',
    pretrained_text_path='/path/to/text_weights.pt'
)

Build docs developers (and LLMs) love