From 447147a25b5c1e5cf910e5fad3f5ed38de3344ee Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 23 Dec 2024 16:52:08 -0800 Subject: [PATCH 1/4] Switching to timm specific weight instances for open_clip image encoders to facilitate hf-hub: use in timm and new transformers TimmWrapper --- timm/models/convnext.py | 30 ++++++----------- timm/models/vision_transformer.py | 56 ++++++++++++------------------- 2 files changed, 32 insertions(+), 54 deletions(-) diff --git a/timm/models/convnext.py b/timm/models/convnext.py index e682379f64..a6d1999bde 100644 --- a/timm/models/convnext.py +++ b/timm/models/convnext.py @@ -916,53 +916,43 @@ def _cfgv2(url='', **kwargs): # CLIP original image tower weights 'convnext_base.clip_laion2b': _cfg( - hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640), 'convnext_base.clip_laion2b_augreg': _cfg( - hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640), 'convnext_base.clip_laiona': _cfg( - hf_hub_id='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640), 'convnext_base.clip_laiona_320': _cfg( - hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640), 'convnext_base.clip_laiona_augreg_320': _cfg( - hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640), 'convnext_large_mlp.clip_laion2b_augreg': _cfg( - hf_hub_id='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=768), 'convnext_large_mlp.clip_laion2b_ft_320': _cfg( - hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768), 'convnext_large_mlp.clip_laion2b_ft_soup_320': _cfg( - hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768), 'convnext_xxlarge.clip_laion2b_soup': _cfg( - hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024), 'convnext_xxlarge.clip_laion2b_rewind': _cfg( - 
hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024), diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index b3b0ddca07..63526c93b7 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -1556,9 +1556,6 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0), - 'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg( - #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k', # FIXME weight exists, need to push - mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821), 'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg( hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821), @@ -1569,9 +1566,6 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821), - 'vit_base_patch32_clip_224.openai_ft_in12k': _cfg( - # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k', # FIXME weight exists, need to push - mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821), 'vit_base_patch16_clip_224.openai_ft_in12k': _cfg( hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821), @@ -1580,28 +1574,22 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821), 'vit_base_patch32_clip_224.laion2b': _cfg( - hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512), 'vit_base_patch16_clip_224.laion2b': _cfg( - hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512), 'vit_large_patch14_clip_224.laion2b': _cfg( - hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768), 'vit_huge_patch14_clip_224.laion2b': _cfg( - hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024), 'vit_giant_patch14_clip_224.laion2b': _cfg( - hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024), 'vit_gigantic_patch14_clip_224.laion2b': _cfg( - hf_hub_id='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280), 'vit_base_patch32_clip_224.laion400m_e32': _cfg( @@ -1620,21 +1608,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768), 'vit_base_patch32_clip_224.datacompxl': _cfg( - hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, 
num_classes=512), 'vit_base_patch32_clip_256.datacompxl': _cfg( - hf_hub_id='laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 256, 256), num_classes=512), 'vit_base_patch16_clip_224.datacompxl': _cfg( - hf_hub_id='laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512), 'vit_large_patch14_clip_224.datacompxl': _cfg( - hf_hub_id='laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768), 'vit_base_patch16_clip_224.dfn2b': _cfg( @@ -1659,42 +1643,46 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024), 'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', license='cc-by-nc-4.0', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512), 'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', license='cc-by-nc-4.0', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512), 'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', license='cc-by-nc-4.0', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768), 'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', license='cc-by-nc-4.0', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024), + 'vit_huge_patch14_clip_224.metaclip_altogether': _cfg( + hf_hub_id='timm/', + license='cc-by-nc-4.0', + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024), 'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', license='cc-by-nc-4.0', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280), 'vit_base_patch32_clip_224.metaclip_400m': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', license='cc-by-nc-4.0', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512), 'vit_base_patch16_clip_224.metaclip_400m': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', license='cc-by-nc-4.0', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512), 'vit_large_patch14_clip_224.metaclip_400m': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', 
license='cc-by-nc-4.0', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768), From 7533a7f0c24043cf514f0f1c57c2709bfff04251 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 23 Dec 2024 17:40:21 -0800 Subject: [PATCH 2/4] Move siglip timm weights to own repos --- timm/models/vision_transformer.py | 66 +++++++++++-------------------- 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index 63526c93b7..d9f5caf15c 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -1834,96 +1834,77 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0), 'vit_base_patch16_siglip_224.webli': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0), 'vit_base_patch16_siglip_256.webli': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP-256', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_256.webli_i18n': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_384.webli': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP-384', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_base_patch16_siglip_512.webli': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP-512', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), 'vit_large_patch16_siglip_256.webli': _cfg( - hf_hub_id='timm/ViT-L-16-SigLIP-256', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_large_patch16_siglip_384.webli': _cfg( - hf_hub_id='timm/ViT-L-16-SigLIP-384', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_so400m_patch14_siglip_224.webli': _cfg( - hf_hub_id='timm/ViT-SO400M-14-SigLIP', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0), 'vit_so400m_patch16_siglip_256.webli_i18n': _cfg( - hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_so400m_patch14_siglip_378.webli': _cfg( - hf_hub_id='timm/ViT-SO400M-14-SigLIP-384', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 378, 378), num_classes=0), 'vit_so400m_patch14_siglip_384.webli': _cfg( - hf_hub_id='timm/ViT-SO400M-14-SigLIP-384', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_base_patch16_siglip_gap_224.webli': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0), 'vit_base_patch16_siglip_gap_256.webli': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP-256', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_base_patch16_siglip_gap_256.webli_i18n': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP-i18n-256', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 
'vit_base_patch16_siglip_gap_384.webli': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP-384', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_base_patch16_siglip_gap_512.webli': _cfg( - hf_hub_id='timm/ViT-B-16-SigLIP-512', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 512, 512), num_classes=0), 'vit_large_patch16_siglip_gap_256.webli': _cfg( - hf_hub_id='timm/ViT-L-16-SigLIP-256', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_large_patch16_siglip_gap_384.webli': _cfg( - hf_hub_id='timm/ViT-L-16-SigLIP-384', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 384, 384), num_classes=0), 'vit_so400m_patch14_siglip_gap_224.webli': _cfg( - hf_hub_id='timm/ViT-SO400M-14-SigLIP', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0), 'vit_so400m_patch14_siglip_gap_224.pali_mix': _cfg( hf_hub_id='google/paligemma-3b-mix-224-jax', @@ -1936,18 +1917,15 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: custom_load='hf', num_classes=0), 'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg( - hf_hub_id='timm/ViT-SO400M-16-SigLIP-i18n-256', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 256, 256), num_classes=0), 'vit_so400m_patch14_siglip_gap_378.webli': _cfg( - hf_hub_id='timm/ViT-SO400M-14-SigLIP-384', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 378, 378), crop_pct=1.0, num_classes=0), 'vit_so400m_patch14_siglip_gap_384.webli': _cfg( - hf_hub_id='timm/ViT-SO400M-14-SigLIP-384', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', input_size=(3, 384, 384), crop_pct=1.0, num_classes=0), 'vit_so400m_patch14_siglip_gap_448.pali_mix': _cfg( From 4f4f40baa69767db2fa272cf7ac037485514bd13 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Fri, 27 Dec 2024 12:04:04 -0800 Subject: [PATCH 3/4] Add support for tag, license customization through push_to_hub --- timm/models/_hub.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/timm/models/_hub.py b/timm/models/_hub.py index 4922dfc09a..809705ed39 100644 --- a/timm/models/_hub.py +++ b/timm/models/_hub.py @@ -395,10 +395,18 @@ def push_to_hf_hub( def generate_readme(model_card: dict, model_name: str): + tags = model_card.get('tags', None) or ['image-classification', 'timm'] readme_text = "---\n" - readme_text += "tags:\n- image-classification\n- timm\n" - readme_text += "library_name: timm\n" + if tags: + readme_text += "tags:\n" + for t in tags: + readme_text += f"- {t}\n" + readme_text += f"library_name: {model_card.get('library_name', 'timm')}\n" readme_text += f"license: {model_card.get('license', 'apache-2.0')}\n" + if 'license_name' in model_card: + readme_text += f"license_name: {model_card.get('license_name')}\n" + if 'license_link' in model_card: + readme_text += f"license_link: {model_card.get('license_link')}\n" if 'details' in model_card and 'Dataset' in model_card['details']: readme_text += 'datasets:\n' if isinstance(model_card['details']['Dataset'], (tuple, list)): From 5cf022f22879ab450dc06f705864d4661f2ed90d Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Fri, 27 Dec 2024 12:05:22 -0800 Subject: [PATCH 4/4] Add more pali(2) weights. Switch rest of models adapting open_clip weights to their own weight instances. 
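With these weights published under their own timm weight instances (the 'timm/' shorthand in hf_hub_id is expanded to 'timm/<model_name>'), they should load like any other pretrained timm model and, by extension, through the new transformers TimmWrapper. A minimal sketch, assuming the relocated weights are live on the hub; the model name is taken from the configs in this series, and the TimmWrapper usage assumes a transformers release that ships it:

    import timm
    import torch

    # Image tower that previously pointed at an external repo + open_clip_pytorch_model.bin,
    # now resolved via its own timm/ weight instance declared in default_cfgs.
    model = timm.create_model(
        'vit_so400m_patch14_siglip_gap_448.pali2_3b_pt',
        pretrained=True,
        num_classes=0,  # embedding / feature output, matching the cfg default
    ).eval()

    data_cfg = timm.data.resolve_model_data_config(model)
    transform = timm.data.create_transform(**data_cfg, is_training=False)

    with torch.no_grad():
        feats = model(torch.randn(1, *data_cfg['input_size']))  # pooled features

    # Roughly equivalent via transformers (TimmWrapper), names assumed rather than verified here:
    #   from transformers import AutoModel
    #   wrapped = AutoModel.from_pretrained('timm/vit_so400m_patch14_siglip_gap_448.pali2_3b_pt')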
--- timm/models/byobnet.py | 32 ++----- timm/models/eva.py | 35 ++++--- timm/models/hieradet_sam2.py | 65 ++++++++----- timm/models/vision_transformer.py | 153 +++++++++++++++++++++--------- 4 files changed, 178 insertions(+), 107 deletions(-) diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py index 93ca5e72c9..12433c8497 100644 --- a/timm/models/byobnet.py +++ b/timm/models/byobnet.py @@ -2282,56 +2282,48 @@ def _cfgr(url='', **kwargs): # original attention pool head variants 'resnet50_clip.openai': _cfgr( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7), classifier='head.proj', ), 'resnet101_clip.openai': _cfgr( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7), classifier='head.proj', ), 'resnet50x4_clip.openai': _cfgr( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', num_classes=640, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, fixed_input_size=True, input_size=(3, 288, 288), pool_size=(9, 9), classifier='head.proj', ), 'resnet50x16_clip.openai': _cfgr( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', num_classes=768, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, fixed_input_size=True, input_size=(3, 384, 384), pool_size=(12, 12), classifier='head.proj', ), 'resnet50x64_clip.openai': _cfgr( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, fixed_input_size=True, input_size=(3, 448, 448), pool_size=(14, 14), classifier='head.proj', ), 'resnet50_clip.cc12m': _cfgr( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7), classifier='head.proj', ), 'resnet50_clip.yfcc15m': _cfgr( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7), classifier='head.proj', ), 'resnet101_clip.yfcc15m': _cfgr( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7), classifier='head.proj', @@ -2339,50 +2331,42 @@ def _cfgr(url='', **kwargs): # avg-pool w/ optional standard classifier head variants 'resnet50_clip_gap.openai': _cfgr( - hf_hub_id='timm/resnet50_clip.openai', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 224, 224), pool_size=(7, 7), ), 'resnet101_clip_gap.openai': _cfgr( - hf_hub_id='timm/resnet101_clip.openai', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 224, 224), pool_size=(7, 7), ), 'resnet50x4_clip_gap.openai': _cfgr( - hf_hub_id='timm/resnet50x4_clip.openai', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 288, 288), pool_size=(9, 9), ), 'resnet50x16_clip_gap.openai': _cfgr( - hf_hub_id='timm/resnet50x16_clip.openai', - hf_hub_filename='open_clip_pytorch_model.bin', + 
hf_hub_id='timm/', num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 384, 384), pool_size=(12, 12), ), 'resnet50x64_clip_gap.openai': _cfgr( - hf_hub_id='timm/resnet50x64_clip.openai', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 448, 448), pool_size=(14, 14), ), 'resnet50_clip_gap.cc12m': _cfgr( - hf_hub_id='timm/resnet50_clip.cc12m', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 224, 224), pool_size=(7, 7), ), 'resnet50_clip_gap.yfcc15m': _cfgr( - hf_hub_id='timm/resnet50_clip.yfcc15m', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 224, 224), pool_size=(7, 7), ), 'resnet101_clip_gap.yfcc15m': _cfgr( - hf_hub_id='timm/resnet101_clip.yfcc15m', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 224, 224), pool_size=(7, 7), ), diff --git a/timm/models/eva.py b/timm/models/eva.py index fe87154050..552965947b 100644 --- a/timm/models/eva.py +++ b/timm/models/eva.py @@ -912,45 +912,52 @@ def _cfg(url='', **kwargs): # EVA01 and EVA02 CLIP image towers 'eva_giant_patch14_clip_224.laion400m': _cfg( # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA01_CLIP_g_14_plus_psz14_s11B.pt', - hf_hub_id='timm/eva_giant_patch14_clip_224.laion400m_s11b_b41k', # float16 weights - hf_hub_filename='open_clip_pytorch_model.bin', + # hf_hub_id='timm/eva_giant_patch14_clip_224.laion400m_s11b_b41k', # float16 weights + # hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=1024, ), 'eva_giant_patch14_clip_224.merged2b': _cfg( # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA01_CLIP_g_14_plus_psz14_s11B.pt', - hf_hub_id='timm/eva_giant_patch14_plus_clip_224.merged2b_s11b_b114k', # float16 weights - hf_hub_filename='open_clip_pytorch_model.bin', + # hf_hub_id='timm/eva_giant_patch14_plus_clip_224.merged2b_s11b_b114k', # float16 weights + # hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=1024, ), 'eva02_base_patch16_clip_224.merged2b': _cfg( # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt', - hf_hub_id='timm/eva02_base_patch16_clip_224.merged2b_s8b_b131k', # float16 weights - hf_hub_filename='open_clip_pytorch_model.bin', + # hf_hub_id='timm/eva02_base_patch16_clip_224.merged2b_s8b_b131k', # float16 weights + # hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=512, ), 'eva02_large_patch14_clip_224.merged2b': _cfg( # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt', - hf_hub_id='timm/eva02_large_patch14_clip_224.merged2b_s4b_b131k', # float16 weights - hf_hub_filename='open_clip_pytorch_model.bin', + # hf_hub_id='timm/eva02_large_patch14_clip_224.merged2b_s4b_b131k', # float16 weights + # hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=768, ), 'eva02_large_patch14_clip_336.merged2b': _cfg( # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt', - hf_hub_id='timm/eva02_large_patch14_clip_336.merged2b_s6b_b61k', # float16 weights - hf_hub_filename='open_clip_pytorch_model.bin', + # hf_hub_id='timm/eva02_large_patch14_clip_336.merged2b_s6b_b61k', # float16 weights + # hf_hub_filename='open_clip_pytorch_model.bin', + 
hf_hub_id='timm/', input_size=(3, 336, 336), crop_pct=1.0, num_classes=768, ), 'eva02_enormous_patch14_clip_224.laion2b': _cfg( # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_E_psz14_plus_s9B.pt', - hf_hub_id='timm/eva02_enormous_patch14_clip_224.laion2b_s4b_b115k', # float16 weights - hf_hub_filename='open_clip_pytorch_model.bin', + # hf_hub_id='timm/eva02_enormous_patch14_clip_224.laion2b_s4b_b115k', # float16 weights + # hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=1024, ), 'eva02_enormous_patch14_clip_224.laion2b_plus': _cfg( # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_E_psz14_plus_s9B.pt', - hf_hub_id='timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k', # bfloat16 weights - hf_hub_filename='open_clip_pytorch_model.bin', + # hf_hub_id='timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k', # bfloat16 weights + # hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', num_classes=1024, ), 'eva02_enormous_patch14_clip_224.pretrain': _cfg( diff --git a/timm/models/hieradet_sam2.py b/timm/models/hieradet_sam2.py index d9585a526c..6cd2592a95 100644 --- a/timm/models/hieradet_sam2.py +++ b/timm/models/hieradet_sam2.py @@ -530,26 +530,47 @@ def _cfg(url='', **kwargs): default_cfgs = generate_default_cfgs({ - "sam2_hiera_tiny.r224": _cfg( - hf_hub_id='facebook/sam2-hiera-tiny', - hf_hub_filename='sam2_hiera_tiny.pt', - input_size=(3, 224, 224), pool_size=(7, 7), - ), # FIXME reduced res for testing - "sam2_hiera_tiny.r896": _cfg( - hf_hub_id='facebook/sam2-hiera-tiny', - hf_hub_filename='sam2_hiera_tiny.pt', + "sam2_hiera_tiny.fb_r896": _cfg( + # hf_hub_id='facebook/sam2-hiera-tiny', + # hf_hub_filename='sam2_hiera_tiny.pt', + hf_hub_id='timm/', ), - "sam2_hiera_small": _cfg( - hf_hub_id='facebook/sam2-hiera-small', - hf_hub_filename='sam2_hiera_small.pt', + "sam2_hiera_tiny.fb_r896_2pt1": _cfg( + # hf_hub_id='facebook/sam2.1-hiera-tiny', + # hf_hub_filename='sam2.1_hiera_tiny.pt', + hf_hub_id='timm/', ), - "sam2_hiera_base_plus": _cfg( - hf_hub_id='facebook/sam2-hiera-base-plus', - hf_hub_filename='sam2_hiera_base_plus.pt', + "sam2_hiera_small.fb_r896": _cfg( + # hf_hub_id='facebook/sam2-hiera-small', + # hf_hub_filename='sam2_hiera_small.pt', + hf_hub_id='timm/', ), - "sam2_hiera_large": _cfg( - hf_hub_id='facebook/sam2-hiera-large', - hf_hub_filename='sam2_hiera_large.pt', + "sam2_hiera_small.fb_r896_2pt1": _cfg( + # hf_hub_id='facebook/sam2.1-hiera-small', + # hf_hub_filename='sam2.1_hiera_small.pt', + hf_hub_id='timm/', + ), + "sam2_hiera_base_plus.fb_r896": _cfg( + # hf_hub_id='facebook/sam2-hiera-base-plus', + # hf_hub_filename='sam2_hiera_base_plus.pt', + hf_hub_id='timm/', + ), + "sam2_hiera_base_plus.fb_r896_2pt1": _cfg( + # hf_hub_id='facebook/sam2.1-hiera-base-plus', + # hf_hub_filename='sam2.1_hiera_base_plus.pt', + hf_hub_id='timm/', + ), + "sam2_hiera_large.fb_r1024": _cfg( + # hf_hub_id='facebook/sam2-hiera-large', + # hf_hub_filename='sam2_hiera_large.pt', + hf_hub_id='timm/', + min_input_size=(3, 256, 256), + input_size=(3, 1024, 1024), pool_size=(32, 32), + ), + "sam2_hiera_large.fb_r1024_2pt1": _cfg( + # hf_hub_id='facebook/sam2.1-hiera-large', + # hf_hub_filename='sam2.1_hiera_large.pt', + hf_hub_id='timm/', min_input_size=(3, 256, 256), input_size=(3, 1024, 1024), pool_size=(32, 32), ), @@ -578,11 +599,11 @@ def checkpoint_filter_fn(state_dict, model=None, prefix=''): def _create_hiera_det(variant: str, pretrained: bool = False, **kwargs) -> HieraDet: out_indices = 
kwargs.pop('out_indices', 4) checkpoint_prefix = '' - if 'sam2' in variant: - # SAM2 pretrained weights have no classifier or final norm-layer (`head.norm`) - # This is workaround loading with num_classes=0 w/o removing norm-layer. - kwargs.setdefault('pretrained_strict', False) - checkpoint_prefix = 'image_encoder.trunk.' + # if 'sam2' in variant: + # # SAM2 pretrained weights have no classifier or final norm-layer (`head.norm`) + # # This is workaround loading with num_classes=0 w/o removing norm-layer. + # kwargs.setdefault('pretrained_strict', False) + # checkpoint_prefix = 'image_encoder.trunk.' return build_model_with_cfg( HieraDet, variant, diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index d9f5caf15c..6bc93dd1c6 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -912,26 +912,40 @@ def resize_pos_embed( @torch.no_grad() -def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = '') -> None: +def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = '', load_bfloat16: bool = False) -> None: """ Load weights from .npz checkpoints for official Google Brain Flax implementation """ import numpy as np + if load_bfloat16: + import jax.numpy as jnp + import ml_dtypes - def _n2p(w, t=True, idx=None): + def _n2p(_w, t=True, idx=None): if idx is not None: - w = w[idx] - if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1: - w = w.flatten() + _w = _w[idx] + + if load_bfloat16: + _w = _w.view(ml_dtypes.bfloat16).astype(jnp.float32) + _w = np.array(_w) + + if _w.ndim == 4 and _w.shape[0] == _w.shape[1] == _w.shape[2] == 1: + _w = _w.flatten() if t: - if w.ndim == 4: - w = w.transpose([3, 2, 0, 1]) - elif w.ndim == 3: - w = w.transpose([2, 0, 1]) - elif w.ndim == 2: - w = w.transpose([1, 0]) - return torch.from_numpy(w) - - w = np.load(checkpoint_path) + if _w.ndim == 4: + _w = _w.transpose([3, 2, 0, 1]) + elif _w.ndim == 3: + _w = _w.transpose([2, 0, 1]) + elif _w.ndim == 2: + _w = _w.transpose([1, 0]) + + _w = torch.from_numpy(_w) + return _w + + if load_bfloat16: + w = jnp.load(checkpoint_path) + else: + w = np.load(checkpoint_path) + interpolation = 'bilinear' antialias = False big_vision = False @@ -1593,18 +1607,18 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280), 'vit_base_patch32_clip_224.laion400m_e32': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512), 'vit_base_patch16_clip_224.laion400m_e32': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512), 'vit_base_patch16_plus_clip_240.laion400m_e32': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, - input_size=(3, 240, 240), crop_pct=1.0, num_classes=512), + input_size=(3, 240, 240), crop_pct=1.0, num_classes=640), 'vit_large_patch14_clip_224.laion400m_e32': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768), 'vit_base_patch32_clip_224.datacompxl': _cfg( @@ -1622,22 +1636,18 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, 
Any]: mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768), 'vit_base_patch16_clip_224.dfn2b': _cfg( - hf_hub_id='apple/DFN2B-CLIP-ViT-B-16', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512), 'vit_large_patch14_clip_224.dfn2b': _cfg( - hf_hub_id='apple/DFN2B-CLIP-ViT-L-14', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768), 'vit_huge_patch14_clip_224.dfn5b': _cfg( - hf_hub_id='apple/DFN5B-CLIP-ViT-H-14', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024), 'vit_huge_patch14_clip_378.dfn5b': _cfg( - hf_hub_id='apple/DFN5B-CLIP-ViT-H-14-378', - hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, notes=('natively QuickGELU, use quickgelu model variant for original results',), crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024), @@ -1700,7 +1710,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768), 'vit_large_patch14_clip_336.openai': _cfg( - hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin', + hf_hub_id='timm/', notes=('natively QuickGELU, use quickgelu model variant for original results',), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 336, 336), num_classes=768), @@ -1907,15 +1917,22 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: hf_hub_id='timm/', num_classes=0), 'vit_so400m_patch14_siglip_gap_224.pali_mix': _cfg( - hf_hub_id='google/paligemma-3b-mix-224-jax', - hf_hub_filename='paligemma-3b-mix-224.npz', - custom_load='hf', + hf_hub_id='timm/', num_classes=0), 'vit_so400m_patch14_siglip_gap_224.pali_pt': _cfg( - hf_hub_id='google/paligemma-3b-pt-224-jax', - hf_hub_filename='paligemma-3b-pt-224.npz', - custom_load='hf', + hf_hub_id='timm/', + num_classes=0), + 'vit_so400m_patch14_siglip_gap_224.pali2_3b_pt': _cfg( + hf_hub_id='timm/', num_classes=0), + 'vit_so400m_patch14_siglip_gap_224.pali2_10b_pt': _cfg( + hf_hub_id='timm/', + num_classes=0), + # 'vit_so400m_patch14_siglip_gap_224.pali2_28b_pt': _cfg( + # hf_hub_id='google/paligemma2-28b-pt-224-jax', + # hf_hub_filename='pt_27b_224.npz', + # custom_load='hf', + # num_classes=0), 'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg( hf_hub_id='timm/', input_size=(3, 256, 256), @@ -1929,23 +1946,69 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: input_size=(3, 384, 384), crop_pct=1.0, num_classes=0), 'vit_so400m_patch14_siglip_gap_448.pali_mix': _cfg( - hf_hub_id='google/paligemma-3b-mix-448-jax', - hf_hub_filename='paligemma-3b-mix-448.npz', - custom_load='hf', + hf_hub_id='timm/', input_size=(3, 448, 448), crop_pct=1.0, num_classes=0), 'vit_so400m_patch14_siglip_gap_448.pali_pt': _cfg( - hf_hub_id='google/paligemma-3b-pt-448-jax', - hf_hub_filename='paligemma-3b-pt-448.npz', - custom_load='hf', + hf_hub_id='timm/', + input_size=(3, 448, 448), crop_pct=1.0, + num_classes=0), + 'vit_so400m_patch14_siglip_gap_448.pali_refcoco_seg': _cfg( + hf_hub_id='timm/', + input_size=(3, 448, 448), crop_pct=1.0, 
+ num_classes=0), + 'vit_so400m_patch14_siglip_gap_448.pali_ocrvqa': _cfg( + hf_hub_id='timm/', + input_size=(3, 448, 448), crop_pct=1.0, + num_classes=0), + 'vit_so400m_patch14_siglip_gap_448.pali2_3b_pt': _cfg( + hf_hub_id='timm/', + input_size=(3, 448, 448), crop_pct=1.0, + num_classes=0), + 'vit_so400m_patch14_siglip_gap_448.pali2_10b_pt': _cfg( + hf_hub_id='timm/', + input_size=(3, 448, 448), crop_pct=1.0, + num_classes=0), + # 'vit_so400m_patch14_siglip_gap_448.pali2_28b_pt': _cfg( + # hf_hub_id='google/paligemma2-28b-pt-448-jax', + # hf_hub_filename='pt_27b_448.npz', + # custom_load='hf', + # input_size=(3, 448, 448), crop_pct=1.0, + # num_classes=0), + 'vit_so400m_patch14_siglip_gap_448.pali2_3b_docci': _cfg( + hf_hub_id='timm/', + input_size=(3, 448, 448), crop_pct=1.0, + num_classes=0), + 'vit_so400m_patch14_siglip_gap_448.pali2_10b_docci': _cfg( + hf_hub_id='timm/', input_size=(3, 448, 448), crop_pct=1.0, num_classes=0), 'vit_so400m_patch14_siglip_gap_896.pali_pt': _cfg( - hf_hub_id='google/paligemma-3b-pt-896-jax', - hf_hub_filename='paligemma-3b-pt-896.npz', - custom_load='hf', + hf_hub_id='timm/', + input_size=(3, 896, 896), crop_pct=1.0, + num_classes=0), + 'vit_so400m_patch14_siglip_gap_896.pali_refcoco_seg': _cfg( + hf_hub_id='timm/', + input_size=(3, 896, 896), crop_pct=1.0, + num_classes=0), + 'vit_so400m_patch14_siglip_gap_896.pali_ocrvqa': _cfg( + hf_hub_id='timm/', + input_size=(3, 896, 896), crop_pct=1.0, + num_classes=0), + 'vit_so400m_patch14_siglip_gap_896.pali2_3b_pt': _cfg( + hf_hub_id='timm/', + input_size=(3, 896, 896), crop_pct=1.0, + num_classes=0), + 'vit_so400m_patch14_siglip_gap_896.pali2_10b_pt': _cfg( + hf_hub_id='timm/', input_size=(3, 896, 896), crop_pct=1.0, num_classes=0), + # 'vit_so400m_patch14_siglip_gap_896.pali2_28b_pt': _cfg( + # hf_hub_id='google/paligemma2-28b-pt-896-jax', + # hf_hub_filename='pt_27b_896.npz', + # custom_load='hf', + # input_size=(3, 896, 896), crop_pct=1.0, + # num_classes=0), 'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg( hf_hub_id='timm/', @@ -1958,22 +2021,18 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]: 'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', license='mit', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512), 'vit_medium_patch32_clip_224.tinyclip_laion400m': _cfg( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', license='mit', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512), 'vit_medium_patch16_clip_224.tinyclip_yfcc15m': _cfg( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', license='mit', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512), 'vit_betwixt_patch32_clip_224.tinyclip_laion400m': _cfg( hf_hub_id='timm/', - hf_hub_filename='open_clip_pytorch_model.bin', license='mit', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
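Usage sketch for the tag/license customization added to push_to_hf_hub / generate_readme in PATCH 3/4. The model_card keys mirror the ones read by the updated generate_readme; the repo id, tag list and license values below are placeholders, and the import path for push_to_hf_hub is an assumption:

    import timm
    from timm.models import push_to_hf_hub  # assumed export of timm.models._hub.push_to_hf_hub

    model = timm.create_model('resnet50', pretrained=True)

    # Keys consumed by the updated generate_readme(): tags, library_name,
    # license / license_name / license_link, details['Dataset'].
    model_card = {
        'tags': ['image-classification', 'timm', 'transformers'],
        'library_name': 'timm',
        'license': 'other',
        'license_name': 'example-license',              # placeholder
        'license_link': 'https://example.com/LICENSE',  # placeholder
        'details': {'Dataset': 'ImageNet-1k'},
    }

    push_to_hf_hub(
        model,
        'my-org/resnet50.example',  # placeholder repo_id
        model_card=model_card,
    )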