From dc5c7989aed066798bea4785603d486328db652e Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Thu, 20 Feb 2025 12:07:49 -0800 Subject: [PATCH 1/3] Prep for siglip2 release --- timm/layers/attention_pool.py | 12 +- timm/models/vision_transformer.py | 287 ++++++++++++++++++++++++++++-- 2 files changed, 283 insertions(+), 16 deletions(-) diff --git a/timm/layers/attention_pool.py b/timm/layers/attention_pool.py index da5585b363..2e87566ad4 100644 --- a/timm/layers/attention_pool.py +++ b/timm/layers/attention_pool.py @@ -29,6 +29,7 @@ def __init__( pos_embed: str = '', pool_type: str = 'token', norm_layer: Optional[nn.Module] = None, + act_layer: Optional[nn.Module] = nn.GELU, drop: float = 0.0, ): super().__init__() @@ -54,13 +55,18 @@ def __init__( self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias) self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias) - self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() - self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + if qk_norm: + qk_norm_layer = norm_layer or nn.LayerNorm + self.q_norm = qk_norm_layer(self.head_dim) + self.k_norm = qk_norm_layer(self.head_dim) + else: + self.q_norm = nn.Identity() + self.k_norm = nn.Identity() self.proj = nn.Linear(embed_dim, embed_dim) self.proj_drop = nn.Dropout(drop) self.norm = norm_layer(out_features) if norm_layer is not None else nn.Identity() - self.mlp = Mlp(embed_dim, int(embed_dim * mlp_ratio)) + self.mlp = Mlp(embed_dim, int(embed_dim * mlp_ratio), act_layer=act_layer) self.init_weights() diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index bba7afac4f..9a9c28e37c 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -584,6 +584,7 @@ def __init__( num_heads=num_heads, mlp_ratio=mlp_ratio, norm_layer=norm_layer, + act_layer=act_layer, ) else: self.attn_pool = None @@ -1887,9 +1888,20 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: license='cc-by-nc-4.0', mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0), + 'vit_base_patch32_siglip_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), + 'vit_base_patch16_siglip_224.v2_webli': _cfg( + # hf_hub_id='timm/', + num_classes=0), 'vit_base_patch16_siglip_224.webli': _cfg( hf_hub_id='timm/', num_classes=0), + 'vit_base_patch16_siglip_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), 'vit_base_patch16_siglip_256.webli': _cfg( hf_hub_id='timm/', input_size=(3, 256, 256), @@ -1898,28 +1910,51 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), + 'vit_base_patch16_siglip_384.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), + num_classes=0), 'vit_base_patch16_siglip_384.webli': _cfg( hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), + 'vit_base_patch16_siglip_512.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 512, 512), + num_classes=0), 'vit_base_patch16_siglip_512.webli': _cfg( hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), + 'vit_large_patch16_siglip_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), 'vit_large_patch16_siglip_256.webli': _cfg( hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), + 'vit_large_patch16_siglip_384.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), + num_classes=0), 'vit_large_patch16_siglip_384.webli': _cfg( hf_hub_id='timm/', 
input_size=(3, 384, 384), num_classes=0), + 'vit_large_patch16_siglip_512.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 512, 512), + num_classes=0), + 'vit_so400m_patch14_siglip_224.v2_webli': _cfg( + # hf_hub_id='timm/', + num_classes=0), 'vit_so400m_patch14_siglip_224.webli': _cfg( hf_hub_id='timm/', num_classes=0), - 'vit_so400m_patch16_siglip_256.webli_i18n': _cfg( - hf_hub_id='timm/', - input_size=(3, 256, 256), + 'vit_so400m_patch14_siglip_378.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 378, 378), num_classes=0), 'vit_so400m_patch14_siglip_378.webli': _cfg( hf_hub_id='timm/', @@ -1929,10 +1964,45 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), + 'vit_so400m_patch16_siglip_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), + 'vit_so400m_patch16_siglip_256.webli_i18n': _cfg( + hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), + 'vit_so400m_patch16_siglip_384.v2_webli': _cfg( + #hf_hub_id='timm/', + input_size=(3, 384, 384), + num_classes=0), + 'vit_so400m_patch16_siglip_512.v2_webli': _cfg( + #hf_hub_id='timm/', + input_size=(3, 512, 512), + num_classes=0), + 'vit_giantopt_patch16_siglip_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), + 'vit_giantopt_patch16_siglip_384.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), + num_classes=0), + 'vit_base_patch32_siglip_gap_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), + 'vit_base_patch16_siglip_gap_224.v2_webli': _cfg( + # hf_hub_id='timm/', + num_classes=0), 'vit_base_patch16_siglip_gap_224.webli': _cfg( hf_hub_id='timm/', num_classes=0), + 'vit_base_patch16_siglip_gap_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), 'vit_base_patch16_siglip_gap_256.webli': _cfg( hf_hub_id='timm/', input_size=(3, 256, 256), @@ -1941,22 +2011,45 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), + 'vit_base_patch16_siglip_gap_384.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), + num_classes=0), 'vit_base_patch16_siglip_gap_384.webli': _cfg( hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), + 'vit_base_patch16_siglip_gap_512.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 512, 512), + num_classes=0), 'vit_base_patch16_siglip_gap_512.webli': _cfg( hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), + 'vit_large_patch16_siglip_gap_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), 'vit_large_patch16_siglip_gap_256.webli': _cfg( hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), + 'vit_large_patch16_siglip_gap_384.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), + num_classes=0), 'vit_large_patch16_siglip_gap_384.webli': _cfg( hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), + 'vit_large_patch16_siglip_gap_512.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 512, 512), + num_classes=0), + 'vit_so400m_patch14_siglip_gap_224.v2_webli': _cfg( + # hf_hub_id='timm/', + num_classes=0), 'vit_so400m_patch14_siglip_gap_224.webli': _cfg( hf_hub_id='timm/', num_classes=0), @@ -1977,9 +2070,9 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: # hf_hub_filename='pt_27b_224.npz', # custom_load='hf', # num_classes=0), - 'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg( - 
hf_hub_id='timm/', - input_size=(3, 256, 256), + 'vit_so400m_patch14_siglip_gap_378.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 378, 378), num_classes=0), 'vit_so400m_patch14_siglip_gap_378.webli': _cfg( hf_hub_id='timm/', @@ -2053,6 +2146,30 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: # custom_load='hf', # input_size=(3, 896, 896), crop_pct=1.0, # num_classes=0), + 'vit_so400m_patch16_siglip_gap_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), + 'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg( + hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), + 'vit_so400m_patch16_siglip_gap_384.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), + num_classes=0), + 'vit_so400m_patch16_siglip_gap_512.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 512, 512), + num_classes=0), + 'vit_giantopt_patch16_siglip_gap_256.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), + num_classes=0), + 'vit_giantopt_patch16_siglip_gap_384.v2_webli': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), + num_classes=0), 'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg( hf_hub_id='timm/', @@ -3114,6 +3231,17 @@ def vit_giant_patch14_reg4_dinov2(pretrained: bool = False, **kwargs) -> VisionT return model +@register_model +def vit_base_patch32_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=32, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='map', + act_layer='gelu_tanh', + ) + model = _create_vision_transformer( + 'vit_base_patch32_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + @register_model def vit_base_patch16_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer: model_args = dict( @@ -3175,23 +3303,23 @@ def vit_large_patch16_siglip_384(pretrained: bool = False, **kwargs) -> VisionTr @register_model -def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer: +def vit_large_patch16_siglip_512(pretrained: bool = False, **kwargs) -> VisionTransformer: model_args = dict( - patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map', + patch_size=16, embed_dim=1024, depth=24, num_heads=16, class_token=False, global_pool='map', + act_layer='gelu_tanh' ) model = _create_vision_transformer( - 'vit_so400m_patch14_siglip_224', pretrained=pretrained, **dict(model_args, **kwargs)) + 'vit_large_patch16_siglip_512', pretrained=pretrained, **dict(model_args, **kwargs)) return model @register_model -def vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer: - # this is a corrected variant of the 384 with a res properly divisible by patch size (no padding/truncation) +def vit_so400m_patch14_siglip_224(pretrained: bool = False, **kwargs) -> VisionTransformer: model_args = dict( - patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map', + patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map', ) model = _create_vision_transformer( - 'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs)) + 'vit_so400m_patch14_siglip_224', pretrained=pretrained, **dict(model_args, **kwargs)) return model @@ -3216,6 +3344,72 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionT return model +@register_model +def 
vit_so400m_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map', + act_layer='gelu_tanh', + ) + model = _create_vision_transformer( + 'vit_so400m_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_so400m_patch16_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map', + act_layer='gelu_tanh', + ) + model = _create_vision_transformer( + 'vit_so400m_patch16_siglip_384', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_so400m_patch16_siglip_512(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, global_pool='map', + act_layer='gelu_tanh', + ) + model = _create_vision_transformer( + 'vit_so400m_patch16_siglip_512', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_giantopt_patch16_siglip_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1536, depth=40, num_heads=16, class_token=False, global_pool='map', + act_layer='gelu_tanh', + ) + model = _create_vision_transformer( + 'vit_giantopt_patch16_siglip_256', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_giantopt_patch16_siglip_384(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1536, depth=40, num_heads=16, class_token=False, global_pool='map', + act_layer='gelu_tanh', + ) + model = _create_vision_transformer( + 'vit_giantopt_patch16_siglip_384', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_base_patch32_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=32, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False, + act_layer='gelu_tanh', + ) + model = _create_vision_transformer( + 'vit_base_patch32_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + @register_model def vit_base_patch16_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer: """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP).""" @@ -3282,6 +3476,17 @@ def vit_large_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> Visi return model +@register_model +def vit_large_patch16_siglip_gap_512(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, class_token=False, + global_pool='avg', fc_norm=False, act_layer='gelu_tanh' + ) + model = _create_vision_transformer( + 'vit_large_patch16_siglip_gap_512', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + @register_model def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer: """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP).""" @@ -3354,6 +3559,62 @@ def vit_so400m_patch14_siglip_gap_896(pretrained: bool = False, **kwargs) -> Vis return model +@register_model 
+def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, + global_pool='avg', fc_norm=False, act_layer='gelu_tanh' + ) + model = _create_vision_transformer( + 'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_so400m_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, + global_pool='avg', fc_norm=False, act_layer='gelu_tanh' + ) + model = _create_vision_transformer( + 'vit_so400m_patch16_siglip_gap_384', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_so400m_patch16_siglip_gap_512(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, + global_pool='avg', fc_norm=False, act_layer='gelu_tanh' + ) + model = _create_vision_transformer( + 'vit_so400m_patch16_siglip_gap_512', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_giantopt_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1536, depth=40, num_heads=16, class_token=False, + global_pool='avg', fc_norm=False, act_layer='gelu_tanh' + ) + model = _create_vision_transformer( + 'vit_giantopt_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + +@register_model +def vit_giantopt_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer: + model_args = dict( + patch_size=16, embed_dim=1536, depth=40, num_heads=16, class_token=False, + global_pool='avg', fc_norm=False, act_layer='gelu_tanh' + ) + model = _create_vision_transformer( + 'vit_giantopt_patch16_siglip_gap_384', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + + @register_model def vit_wee_patch16_reg1_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer: model_args = dict( From 25de6b4ee1b154abf9d566a16b744ca16471c636 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Thu, 20 Feb 2025 16:14:58 -0800 Subject: [PATCH 2/3] Remove duplicate so400m/16 @ 256 model def --- timm/models/vision_transformer.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index 9a9c28e37c..d328338e18 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -3499,18 +3499,6 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> Vis return model -@register_model -def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer: - """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP).""" - model_args = dict( - patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, - class_token=False, global_pool='avg', fc_norm=False, - ) - model = _create_vision_transformer( - 'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs)) - return model - - @register_model def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer: """ A SigLIP 
variant of ViT with global average pooling (GAP) instead of attention pooling (MAP).""" @@ -3561,9 +3549,10 @@ def vit_so400m_patch14_siglip_gap_896(pretrained: bool = False, **kwargs) -> Vis @register_model def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP).""" model_args = dict( - patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False, - global_pool='avg', fc_norm=False, act_layer='gelu_tanh' + patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, + class_token=False, global_pool='avg', fc_norm=False, act_layer='gelu_tanh', ) model = _create_vision_transformer( 'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs)) From 228e080e39ce5d7599ba91a311b59bbf6fd3f93a Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Fri, 21 Feb 2025 12:46:14 -0800 Subject: [PATCH 3/3] siglip2 weights on hub, fix forward_intermediates when no prefix tokens (& return prefix selected) --- timm/models/vision_transformer.py | 65 ++++++++++++++++--------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index d328338e18..3c7b9a2277 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -769,11 +769,14 @@ def forward_intermediates( # split prefix (e.g. class, distill) and spatial feature tokens prefix_tokens = [y[:, 0:self.num_prefix_tokens] for y in intermediates] intermediates = [y[:, self.num_prefix_tokens:] for y in intermediates] + else: + prefix_tokens = None + if reshape: # reshape to BCHW output format H, W = self.patch_embed.dynamic_feat_size((height, width)) intermediates = [y.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() for y in intermediates] - if not torch.jit.is_scripting() and return_prefix_tokens: + if not torch.jit.is_scripting() and return_prefix_tokens and prefix_tokens is not None: # return_prefix not support in torchscript due to poor type handling intermediates = list(zip(intermediates, prefix_tokens)) @@ -1889,17 +1892,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0), 'vit_base_patch32_siglip_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_224.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', num_classes=0), 'vit_base_patch16_siglip_224.webli': _cfg( hf_hub_id='timm/', num_classes=0), 'vit_base_patch16_siglip_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_256.webli': _cfg( @@ -1911,7 +1914,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_384.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_base_patch16_siglip_384.webli': _cfg( @@ -1919,7 +1922,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 384, 384), num_classes=0), 'vit_base_patch16_siglip_512.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), 'vit_base_patch16_siglip_512.webli': _cfg( @@ -1927,7 +1930,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 512, 512), num_classes=0), 
'vit_large_patch16_siglip_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_large_patch16_siglip_256.webli': _cfg( @@ -1935,7 +1938,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 256, 256), num_classes=0), 'vit_large_patch16_siglip_384.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_large_patch16_siglip_384.webli': _cfg( @@ -1943,17 +1946,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 384, 384), num_classes=0), 'vit_large_patch16_siglip_512.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), 'vit_so400m_patch14_siglip_224.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', num_classes=0), 'vit_so400m_patch14_siglip_224.webli': _cfg( hf_hub_id='timm/', num_classes=0), 'vit_so400m_patch14_siglip_378.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 378, 378), num_classes=0), 'vit_so400m_patch14_siglip_378.webli': _cfg( @@ -1965,7 +1968,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 384, 384), num_classes=0), 'vit_so400m_patch16_siglip_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_so400m_patch16_siglip_256.webli_i18n': _cfg( @@ -1973,34 +1976,34 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 256, 256), num_classes=0), 'vit_so400m_patch16_siglip_384.v2_webli': _cfg( - #hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_so400m_patch16_siglip_512.v2_webli': _cfg( - #hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), 'vit_giantopt_patch16_siglip_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_giantopt_patch16_siglip_384.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_base_patch32_siglip_gap_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_gap_224.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', num_classes=0), 'vit_base_patch16_siglip_gap_224.webli': _cfg( hf_hub_id='timm/', num_classes=0), 'vit_base_patch16_siglip_gap_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_gap_256.webli': _cfg( @@ -2012,7 +2015,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_gap_384.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_base_patch16_siglip_gap_384.webli': _cfg( @@ -2020,7 +2023,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 384, 384), num_classes=0), 'vit_base_patch16_siglip_gap_512.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), 'vit_base_patch16_siglip_gap_512.webli': _cfg( @@ -2028,7 +2031,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 512, 512), num_classes=0), 'vit_large_patch16_siglip_gap_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_large_patch16_siglip_gap_256.webli': _cfg( @@ -2036,7 +2039,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 
256, 256), num_classes=0), 'vit_large_patch16_siglip_gap_384.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_large_patch16_siglip_gap_384.webli': _cfg( @@ -2044,11 +2047,11 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 384, 384), num_classes=0), 'vit_large_patch16_siglip_gap_512.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), 'vit_so400m_patch14_siglip_gap_224.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', num_classes=0), 'vit_so400m_patch14_siglip_gap_224.webli': _cfg( hf_hub_id='timm/', @@ -2071,7 +2074,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: # custom_load='hf', # num_classes=0), 'vit_so400m_patch14_siglip_gap_378.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 378, 378), num_classes=0), 'vit_so400m_patch14_siglip_gap_378.webli': _cfg( @@ -2147,7 +2150,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: # input_size=(3, 896, 896), crop_pct=1.0, # num_classes=0), 'vit_so400m_patch16_siglip_gap_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg( @@ -2155,19 +2158,19 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 256, 256), num_classes=0), 'vit_so400m_patch16_siglip_gap_384.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_so400m_patch16_siglip_gap_512.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), 'vit_giantopt_patch16_siglip_gap_256.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_giantopt_patch16_siglip_gap_384.v2_webli': _cfg( - # hf_hub_id='timm/', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0),
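
Usage sketch (not part of the patch series above): a minimal example of how one of the SigLIP2 variants registered here might be loaded and probed once its `.v2_webli` weights are on the hub. The model/tag name comes from the pretrained cfgs in the patches; the forward_intermediates keyword arguments shown (indices, return_prefix_tokens, intermediates_only) are assumed to follow the existing timm API rather than being defined in this diff, so treat this as illustrative only.

import torch
import timm

# Assumed available via hf_hub_id='timm/' per the cfgs enabled in PATCH 3/3.
model = timm.create_model('vit_base_patch16_siglip_224.v2_webli', pretrained=True)
model.eval()

x = torch.randn(1, 3, 224, 224)

# num_classes=0 in the cfg, so the forward pass returns the pooled embedding.
emb = model(x)

# These SigLIP models use class_token=False, i.e. no prefix tokens. That is the
# case the PATCH 3/3 fix covers: prefix_tokens stays None, return_prefix_tokens
# becomes a no-op, and only the reshaped BCHW feature maps are returned.
feats = model.forward_intermediates(
    x,
    indices=2,                  # assumed: select the last two blocks
    return_prefix_tokens=True,  # no prefix tokens here, so nothing is zipped in
    intermediates_only=True,    # assumed flag from the existing timm API
)
for f in feats:
    print(f.shape)  # expected (1, 768, 14, 14) for a patch16 model at 224x224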