Skip to content

Commit

Permalink
Add MobileCLIP-B & conversion. Add ViTamin configs. Some refactoring …
Browse files Browse the repository at this point in the history
…of transformer module.

* Move NLD -> LND transpose into Transformer module forward().
* Started working on a CustomTransformer for the MobileCLIP-S0 text tower, but the scope was too large. Leaving CustomTransformer in for potential future use.
  • Loading branch information
rwightman committed Jun 7, 2024
1 parent 1d7b953 commit 8cf653a
Show file tree
Hide file tree
Showing 20 changed files with 476 additions and 44 deletions.
17 changes: 11 additions & 6 deletions src/open_clip/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,14 @@ def _convert_openclip_txt(module: TextTransformer, prefix):


@torch.no_grad()
def convert_mobile_clip_state_dict(model: CustomTextCLIP, state_dict):
from timm.models.fastvit import _checkpoint_filter_fn

def _convert_timm_img(state_dict, prefix='image_encoder.'):
timm_state_dict = _checkpoint_filter_fn(state_dict, model.visual.trunk)
def convert_mobile_clip_state_dict(model: CustomTextCLIP, state_dict, fastvit = True):

def _convert_timm_img(state_dict):
if fastvit:
from timm.models.fastvit import checkpoint_filter_fn
else:
from timm.models.vision_transformer_hybrid import checkpoint_filter_fn
timm_state_dict = checkpoint_filter_fn(state_dict, model.visual.trunk)
timm_state_dict = {'visual.trunk.' + k: v for k, v in timm_state_dict.items()}
return timm_state_dict

Expand Down Expand Up @@ -181,5 +184,7 @@ def convert_state_dict(model: Union[CustomTextCLIP, CLIP], state_dict):
if 'image_encoder.model.patch_embed.0.rbr_conv.0.conv.weight' in state_dict:
# Apple MobileCLIP s1 & s2 state_dicts (s0 not currently supported; b is converted by the next check)
state_dict = convert_mobile_clip_state_dict(model, state_dict)

if 'image_encoder.model.patch_emb.0.block.conv.weight' in state_dict:
# convert b model
state_dict = convert_mobile_clip_state_dict(model, state_dict, fastvit=False)
return state_dict
2 changes: 0 additions & 2 deletions src/open_clip/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,7 @@ def encode_text(self, text, normalize: bool = False):
x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]

x = x + self.positional_embedding.to(cast_dtype)
x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x, attn_mask=self.attn_mask)
x = x.permute(1, 0, 2) # LND -> NLD
x = self.ln_final(x) # [batch_size, n_ctx, transformer.width]
x, _ = text_global_pool(x, text, self.text_pool_type)
if self.text_projection is not None:
Expand Down
21 changes: 21 additions & 0 deletions src/open_clip/model_configs/MobileCLIP-B.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"embed_dim": 512,
"vision_cfg": {
"timm_model_name": "vit_base_mci_224",
"timm_model_pretrained": false,
"timm_pool": "token",
"timm_proj": null,
"timm_drop": 0.0,
"timm_drop_path": 0.0,
"image_size": 224
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 512,
"heads": 8,
"layers": 12,
"no_causal_mask": false
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-B-LTT.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 768,
"vision_cfg": {
"timm_model_name": "vitamin_base_224",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 224
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 768,
"heads": 12,
"layers": 12
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-B.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 512,
"vision_cfg": {
"timm_model_name": "vitamin_base_224",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 224
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 512,
"heads": 8,
"layers": 12
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-L-256.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 768,
"vision_cfg": {
"timm_model_name": "vitamin_large_256",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 256
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 768,
"heads": 12,
"layers": 12
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-L-336.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 768,
"vision_cfg": {
"timm_model_name": "vitamin_large_336",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 336
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 768,
"heads": 12,
"layers": 12
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-L.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 768,
"vision_cfg": {
"timm_model_name": "vitamin_large_224",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 224
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 768,
"heads": 12,
"layers": 12
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-L2-256.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 1024,
"vision_cfg": {
"timm_model_name": "vitamin_large2_256",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 256
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 1024,
"heads": 16,
"layers": 24
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-L2-336.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 1024,
"vision_cfg": {
"timm_model_name": "vitamin_large2_336",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 336
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 1024,
"heads": 16,
"layers": 24
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-L2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 1024,
"vision_cfg": {
"timm_model_name": "vitamin_large2_224",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 224
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 1024,
"heads": 16,
"layers": 24
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-S-LTT.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 768,
"vision_cfg": {
"timm_model_name": "vitamin_small_224",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 224
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 768,
"heads": 12,
"layers": 12
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-S.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 384,
"vision_cfg": {
"timm_model_name": "vitamin_small_224",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 224
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 384,
"heads": 6,
"layers": 12
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-XL-256.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 1152,
"vision_cfg": {
"timm_model_name": "vitamin_xlarge_256",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 256
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 1152,
"heads": 16,
"layers": 27
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-XL-336.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 1152,
"vision_cfg": {
"timm_model_name": "vitamin_xlarge_336",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 336
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 1152,
"heads": 16,
"layers": 27
},
"custom_text": true
}
20 changes: 20 additions & 0 deletions src/open_clip/model_configs/ViTamin-XL-384.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"embed_dim": 1152,
"vision_cfg": {
"timm_model_name": "vitamin_xlarge_384",
"timm_model_pretrained": false,
"timm_pool": "",
"timm_proj": "linear",
"timm_drop": 0.0,
"timm_drop_path": 0.1,
"image_size": 384
},
"text_cfg": {
"context_length": 77,
"vocab_size": 49408,
"width": 1152,
"heads": 16,
"layers": 27
},
"custom_text": true
}
Loading

0 comments on commit 8cf653a

Please sign in to comment.