39 changes: 39 additions & 0 deletions timm/models/byobnet.py
@@ -2315,6 +2315,27 @@ def _cfgr(url='', **kwargs):
fixed_input_size=True, input_size=(3, 448, 448), pool_size=(14, 14),
classifier='head.proj',
),
'resnet50_clip.cc12m': _cfgr(
hf_hub_id='timm/',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
classifier='head.proj',
),
'resnet50_clip.yfcc15m': _cfgr(
hf_hub_id='timm/',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
classifier='head.proj',
),
'resnet101_clip.yfcc15m': _cfgr(
hf_hub_id='timm/',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
classifier='head.proj',
),

# avg-pool w/ optional standard classifier head variants
'resnet50_clip_gap.openai': _cfgr(
@@ -2347,6 +2368,24 @@ def _cfgr(url='', **kwargs):
num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 448, 448), pool_size=(14, 14),
),
'resnet50_clip_gap.cc12m': _cfgr(
hf_hub_id='timm/resnet50_clip.cc12m',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 224, 224), pool_size=(7, 7),
),
'resnet50_clip_gap.yfcc15m': _cfgr(
hf_hub_id='timm/resnet50_clip.yfcc15m',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 224, 224), pool_size=(7, 7),
),
'resnet101_clip_gap.yfcc15m': _cfgr(
hf_hub_id='timm/resnet101_clip.yfcc15m',
hf_hub_filename='open_clip_pytorch_model.bin',
num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 224, 224), pool_size=(7, 7),
),

'resnet50_mlp.untrained': _cfgr(
input_size=(3, 256, 256), pool_size=(8, 8),
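For context, a minimal usage sketch for the new ResNet CLIP configs above (not part of this diff). It uses the standard timm create_model / data-config API; the only assumption is that the referenced timm/ Hub repos are published under these pretrained tags.

# Usage sketch (not part of this diff): load one of the new CLIP ResNet image towers.
import torch
import timm

# 'resnet50_clip.cc12m' keeps the attention-pool projection head (num_classes=1024 above);
# the *_clip_gap.* variants return avg-pooled features instead (num_classes=0).
model = timm.create_model('resnet50_clip.cc12m', pretrained=True).eval()

# preprocessing (OPENAI_CLIP_MEAN/STD, 224x224) is resolved from the pretrained cfg
data_cfg = timm.data.resolve_model_data_config(model)

with torch.no_grad():
    out = model(torch.randn(1, *data_cfg['input_size']))
print(out.shape)  # expected: torch.Size([1, 1024])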
101 changes: 73 additions & 28 deletions timm/models/vision_transformer.py
@@ -23,6 +23,7 @@

Hacked together by / Copyright 2020, Ross Wightman
"""
import copy
import logging
import math
from collections import OrderedDict
@@ -1601,6 +1602,21 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
hf_hub_filename='open_clip_pytorch_model.bin',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),

'vit_base_patch32_clip_224.laion400m_e32': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
'vit_base_patch16_clip_224.laion400m_e32': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_base_patch16_plus_clip_240.laion400m_e32': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
input_size=(3, 240, 240), crop_pct=1.0, num_classes=512),
'vit_large_patch14_clip_224.laion400m_e32': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),

'vit_base_patch32_clip_224.datacompxl': _cfg(
hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1641,44 +1657,60 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),

'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-b32-fullcc2.5b',
-        hf_hub_filename='metaclip_b32_fullcc2.5b.bin',
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-b16-fullcc2.5b',
-        hf_hub_filename='metaclip_b16_fullcc2.5b.bin',
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-l14-fullcc2.5b',
-        hf_hub_filename='metaclip_l14_fullcc2.5b.bin',
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-h14-fullcc2.5b',
-        hf_hub_filename='metaclip_h14_fullcc2.5b.bin',
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
'vit_base_patch32_clip_224.metaclip_400m': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_base_patch16_clip_224.metaclip_400m': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
'vit_large_patch14_clip_224.metaclip_400m': _cfg(
hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
license='cc-by-nc-4.0',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),

'vit_base_patch32_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_base_patch32_clip_224.openai',
+        hf_hub_id='timm/',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
'vit_base_patch16_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_base_patch16_clip_224.openai',
+        hf_hub_id='timm/',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
'vit_large_patch14_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_large_patch14_clip_224.openai',
+        hf_hub_id='timm/',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
'vit_large_patch14_clip_336.openai': _cfg(
-        hf_hub_id='timm/vit_large_patch14_clip_336.openai', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
notes=('natively QuickGELU, use quickgelu model variant for original results',),
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
crop_pct=1.0, input_size=(3, 336, 336), num_classes=768),
@@ -2071,22 +2103,13 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
input_size=(3, 160, 160), crop_pct=0.95),
}

-_quick_gelu_cfgs = [
-    'vit_large_patch14_clip_224.dfn2b',
-    'vit_huge_patch14_clip_224.dfn5b',
-    'vit_huge_patch14_clip_378.dfn5b',
-    'vit_base_patch32_clip_224.metaclip_2pt5b',
-    'vit_base_patch16_clip_224.metaclip_2pt5b',
-    'vit_large_patch14_clip_224.metaclip_2pt5b',
-    'vit_huge_patch14_clip_224.metaclip_2pt5b',
-    'vit_base_patch32_clip_224.openai',
-    'vit_base_patch16_clip_224.openai',
-    'vit_large_patch14_clip_224.openai',
-    'vit_large_patch14_clip_336.openai',
-]
-default_cfgs.update({
-    n.replace('_clip_', '_clip_quickgelu_'): default_cfgs[n] for n in _quick_gelu_cfgs
-})
+_quick_gelu_cfgs = [n for n, c in default_cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
+for n in _quick_gelu_cfgs:
+    # generate quickgelu default cfgs based on contents of notes field
+    c = copy.deepcopy(default_cfgs[n])
+    if c['hf_hub_id'] == 'timm/':
+        c['hf_hub_id'] = 'timm/' + n  # need to use non-quickgelu model name for hub id
+    default_cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c
default_cfgs = generate_default_cfgs(default_cfgs)
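The rewritten block above replaces the hard-coded quickgelu list: any cfg whose notes mention quickgelu now gets a '_clip_quickgelu_' twin generated automatically, and twins whose hf_hub_id is the bare 'timm/' prefix are redirected to the non-quickgelu repo so both variants resolve to the same weights. A self-contained sketch of that behaviour on one toy entry (not part of the diff, plain dicts standing in for the real cfgs):

import copy

cfgs = {
    'vit_base_patch16_clip_224.openai': dict(
        hf_hub_id='timm/',
        notes=('natively QuickGELU, use quickgelu model variant for original results',),
    ),
}
quick = [n for n, c in cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
for n in quick:
    c = copy.deepcopy(cfgs[n])
    if c['hf_hub_id'] == 'timm/':
        c['hf_hub_id'] = 'timm/' + n  # weights stay under the non-quickgelu repo name
    cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c

print(sorted(cfgs))
# ['vit_base_patch16_clip_224.openai', 'vit_base_patch16_clip_quickgelu_224.openai']
print(cfgs['vit_base_patch16_clip_quickgelu_224.openai']['hf_hub_id'])
# timm/vit_base_patch16_clip_224.openai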


@@ -2510,6 +2533,16 @@ def vit_base_patch16_clip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
return model


@register_model
def vit_base_patch16_plus_clip_240(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" ViT-Base (ViT-B/16+) CLIP image tower @ 240x240
"""
model_args = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_base_patch16_plus_clip_240', pretrained=pretrained, **dict(model_args, **kwargs))
return model
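A quick sanity check on the new B/16+ arguments (a sketch, not part of the diff, using only the values shown above): the 896-wide embedding splits evenly into 14 attention heads of 64, and 240x240 inputs with 16x16 patches give a 15x15 token grid.

# Geometry check for the vit_base_patch16_plus_clip_240 args above
embed_dim, num_heads, patch, img = 896, 14, 16, 240
assert embed_dim % num_heads == 0 and embed_dim // num_heads == 64  # 64-dim attention heads
assert (img // patch) ** 2 == 225                                   # 15x15 patch tokens (+ class token)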


@register_model
def vit_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" ViT-Large model (ViT-L/14) CLIP image tower
@@ -2656,6 +2689,18 @@ def vit_huge_patch14_clip_quickgelu_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
return model


@register_model
def vit_gigantic_patch14_clip_quickgelu_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
""" ViT-bigG model (ViT-G/14) w/ QuickGELU act
"""
model_args = dict(
patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True,
norm_layer=nn.LayerNorm, act_layer='quick_gelu')
model = _create_vision_transformer(
'vit_gigantic_patch14_clip_quickgelu_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
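A short note on the unusual mlp_ratio above (not part of the diff): the OpenCLIP ViT-bigG tower uses an 8192-wide MLP, and 8192 / 1664 reduces exactly to 64 / 13, which is why the ratio is written as a fraction rather than the usual 4.

# Arithmetic behind mlp_ratio=64/13 for embed_dim=1664
embed_dim, mlp_ratio = 1664, 64 / 13
assert round(embed_dim * mlp_ratio) == 8192  # MLP hidden width of ViT-bigG/14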


# Experimental models below

@register_model