Skip to main content
Creates and configures a contrastive vision-language model with flexible weight loading options.

Signature

def create_model(
    model_name: str,
    pretrained: Optional[str] = None,
    load_weights: bool = True,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_patch_dropout: Optional[float] = None,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    force_preprocess_cfg: Optional[Dict[str, Any]] = None,
    force_context_length: Optional[int] = None,
    pretrained_image: bool = False,
    pretrained_text: bool = True,
    pretrained_image_path: Optional[str] = None,
    pretrained_text_path: Optional[str] = None,
    cache_dir: Optional[str] = None,
    output_dict: Optional[bool] = None,
    require_pretrained: bool = False,
    weights_only: bool = True,
    **model_kwargs,
) -> torch.nn.Module:
    ...

Parameters

model_name
str
required
Model identifier, optionally prefixed with a source scheme:
  • 'ViT-B-32': Built-in model name. pretrained specifies CLIP weights source (tag or file path).
  • 'hf-hub:org/repo': Loads config/weights from HuggingFace Hub. pretrained is IGNORED.
  • 'local-dir:/path/to/folder': Loads config/weights from local directory. pretrained is IGNORED.
pretrained
Optional[str]
default:"None"
Source for CLIP weights (tag or file path), used ONLY if model_name has no scheme prefix. Can be a pretrained tag like 'openai', 'laion400m_e32', or a path to a checkpoint file.
load_weights
bool
default:"True"
If True, load the resolved pretrained weights into the model. If False, the model is left randomly initialized (aside from any per-tower weight overrides).
precision
str
default:"'fp32'"
Model precision. Options: 'fp32', 'fp16', 'bf16', 'pure_fp16', 'pure_bf16'.
device
Union[str, torch.device]
default:"'cpu'"
Device to load model on. Can be 'cpu', 'cuda', or a torch.device object.
jit
bool
default:"False"
If True, JIT compile the model using torch.jit.script.
force_quick_gelu
bool
default:"False"
Force use of QuickGELU activation in model config instead of standard GELU.
force_custom_text
bool
default:"False"
Force use of custom text encoder architecture (CustomTextCLIP).
force_patch_dropout
Optional[float]
default:"None"
Override patch dropout value in model config. Values typically range from 0.0 to 1.0.
force_image_size
Optional[Union[int, Tuple[int, int]]]
default:"None"
Override image size in model config. Can be a single int (square) or tuple (height, width).
force_preprocess_cfg
Optional[Dict[str, Any]]
default:"None"
Dictionary to override specific preprocessing parameters (mean, std, interpolation, resize_mode).
force_context_length
Optional[int]
default:"None"
Override context length (max sequence length) in text config.
pretrained_image
bool
default:"False"
Load default base weights for image tower at creation if no CLIP weights loaded. Only effective for timm-based vision models.
pretrained_text
bool
default:"True"
Load default base weights for text tower at creation if no CLIP weights loaded. Only effective for HuggingFace-based text models.
pretrained_image_path
Optional[str]
default:"None"
Path to a checkpoint whose weights are loaded specifically into the image tower after model creation. Applied after any full CLIP checkpoint, so these weights take precedence for the image tower.
pretrained_text_path
Optional[str]
default:"None"
Path to a checkpoint whose weights are loaded specifically into the text tower after model creation. Applied after any full CLIP checkpoint, so these weights take precedence for the text tower.
cache_dir
Optional[str]
default:"None"
Cache directory for downloaded weights. Defaults to ~/.cache/clip.
output_dict
Optional[bool]
default:"None"
If True and the model supports it, the forward pass returns a dictionary of named outputs instead of a tuple of tensors.
require_pretrained
bool
default:"False"
If True, raise an error when no pretrained CLIP weights were successfully loaded.
weights_only
bool
default:"True"
If True, pass weights_only=True to torch.load when loading checkpoints. This restricts unpickling to tensor data, preventing arbitrary code execution from untrusted checkpoint files.
**model_kwargs
Any
Additional keyword arguments for model constructor (highest override priority).

Returns

model
torch.nn.Module
The created model instance (CLIP, CustomTextCLIP, or CoCa depending on configuration).

Example

import open_clip

# Create model with OpenAI pretrained weights
model = open_clip.create_model('ViT-B-32', pretrained='openai')

# Create model from HuggingFace Hub
model = open_clip.create_model('hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K')

# Create model with custom image size
model = open_clip.create_model(
    'ViT-L-14',
    pretrained='datacomp_xl_s13b_b90k',
    force_image_size=336,
    device='cuda',
    precision='fp16'
)

# Create model from local directory
model = open_clip.create_model('local-dir:/path/to/model')

# Create model with separate tower weights
model = open_clip.create_model(
    'ViT-B-32',
    pretrained_image_path='/path/to/image_weights.pt',
    pretrained_text_path='/path/to/text_weights.pt'
)

Build docs developers (and LLMs) love