Image-text Models

  • Number of models: 46

Instruction Models

Alibaba-NLP/gme-Qwen2-VL-2B-Instruct

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
32.8K | 1536 | 2.2B | 8.2 GB | 2024-12-24 | cmn-Hans, eng-Latn
Citation
@misc{zhang2024gme,
      title={GME: Improving Universal Multimodal Retrieval by Multimodal LLMs},
      author={Zhang, Xin and Zhang, Yanzhao and Xie, Wen and Li, Mingxin and Dai, Ziqi and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Li, Wenjie and Zhang, Min},
      year={2024},
      eprint={2412.16855},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2412.16855}
}

Alibaba-NLP/gme-Qwen2-VL-7B-Instruct

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
32.8K | 3584 | 8.3B | 30.9 GB | 2024-12-24 | cmn-Hans, eng-Latn
Citation
@misc{zhang2024gme,
      title={GME: Improving Universal Multimodal Retrieval by Multimodal LLMs},
      author={Zhang, Xin and Zhang, Yanzhao and Xie, Wen and Li, Mingxin and Dai, Ziqi and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Li, Wenjie and Zhang, Min},
      year={2024},
      eprint={2412.16855},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2412.16855}
}

ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-3B-v1

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 128 | 3.0B | 7.0 GB | 2025-11-04 | eng-Latn

ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-7B-v1

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 128 | 7.0B | 14.1 GB | 2025-11-04 | eng-Latn

TIGER-Lab/VLM2Vec-Full

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
131.1K | 3072 | 4.2B | 7.7 GB | 2024-10-08 | eng-Latn
Citation
@article{jiang2024vlm2vec,
  title={VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
  author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.05160},
  year={2024}
}

TIGER-Lab/VLM2Vec-LoRA

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
131.1K | 3072 | not specified | not specified | 2024-10-08 | eng-Latn
Citation
@article{jiang2024vlm2vec,
  title={VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
  author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.05160},
  year={2024}
}

ibm-granite/granite-vision-3.3-2b-embedding

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 128 | 3.0B | 11.1 GB | 2025-06-11 | eng-Latn
Citation
@article{karlinsky2025granitevision,
  title={Granite Vision: a lightweight, open-source multimodal model for enterprise Intelligence},
  author={Granite Vision Team and Karlinsky, Leonid and Arbelle, Assaf and Daniels, Abraham and Nassar, Ahmed and Alfassi, Amit and Wu, Bo and Schwartz, Eli and Joshi, Dhiraj and Kondic, Jovana and Shabtay, Nimrod and Li, Pengyuan and Herzig, Roei and Abedin, Shafiq and Perek, Shaked and Harary, Sivan and Barzelay, Udi and Raz Goldfarb, Adi and Oliva, Aude and Wieles, Ben and Bhattacharjee, Bishwaranjan and Huang, Brandon and Auer, Christoph and Gutfreund, Dan and Beymer, David and Wood, David and Kuehne, Hilde and Hansen, Jacob and Shtok, Joseph and Wong, Ken and Bathen, Luis Angel and Mishra, Mayank and Lysak, Maksym and Dolfi, Michele and Yurochkin, Mikhail and Livathinos, Nikolaos and Harel, Nimrod and Azulai, Ophir and Naparstek, Oshri and de Lima, Rafael Teixeira and Panda, Rameswar and Doveh, Sivan and Gupta, Shubham and Das, Subhro and Zawad, Syed and Kim, Yusik and He, Zexue and Brooks, Alexander and Goodhart, Gabe and Govindjee, Anita and Leist, Derek and Ibrahim, Ibrahim and Soffer, Aya and Cox, David and Soule, Kate and Lastras, Luis and Desai, Nirmit and Ofek-koifman, Shila and Raghavan, Sriram and Syeda-Mahmood, Tanveer and Staar, Peter and Drory, Tal and Feris, Rogerio},
  journal={arXiv preprint arXiv:2502.09927},
  year={2025}
}

intfloat/mmE5-mllama-11b-instruct

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 4096 | 10.6B | 19.8 GB | 2025-02-12 | eng-Latn
Citation
@article{chen2025mmE5,
  title={mmE5: Improving Multimodal Multilingual Embeddings via High-quality Synthetic Data},
  author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
  journal={arXiv preprint arXiv:2502.08468},
  year={2025}
}

jinaai/jina-clip-v1

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
8.2K | 768 | 223.0M | 849.0 MB | 2024-05-30 | eng-Latn
Citation
@article{koukounas2024jinaclip,
  title={Jina CLIP: Your CLIP Model Is Also Your Text Retriever},
  author={Koukounas, Andreas and Mastrapas, Georgios and Günther, Michael and Wang, Bo and Martens, Scott and Mohr, Isabelle and Sturua, Saba and Akram, Mohammad Kalim and Martínez, Joan Fontanals and Ognawala, Saahil and Guzman, Susana and Werk, Maximilian and Wang, Nan and Xiao, Han},
  journal={arXiv preprint arXiv:2405.20204},
  year={2024}
}
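
Usage example: a minimal sketch for loading jina-clip-v1 through Hugging Face Transformers. The encode_text / encode_image helpers come from the model card's remote-code wrapper and are assumed here, not guaranteed.

import torch
from transformers import AutoModel

# trust_remote_code pulls in the model card's custom jina-clip wrapper.
model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)

# The wrapper exposes encode_text / encode_image (per the model card; assumed here).
text_embeddings = model.encode_text(["a photo of a cat", "a photo of a dog"])
image_embeddings = model.encode_image(["https://example.com/cat.jpg"])  # URLs or PIL images

# Text and images share one 768-dimensional space; compare with cosine similarity.
print(text_embeddings.shape, image_embeddings.shape)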

jinaai/jina-embeddings-v4

License: cc-by-nc-4.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
32.8K | 2048 | 3.8B | 7.3 GB | 2025-06-24 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@misc{günther2025jinaembeddingsv4universalembeddingsmultimodal,
      title={jina-embeddings-v4: Universal Embeddings for Multimodal Multilingual Retrieval},
      author={Michael Günther and Saba Sturua and Mohammad Kalim Akram and Isabelle Mohr and Andrei Ungureanu and Sedigheh Eslami and Scott Martens and Bo Wang and Nan Wang and Han Xiao},
      year={2025},
      eprint={2506.18902},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2506.18902},
}

microsoft/LLM2CLIP-Openai-B-16

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
not specified | 1280 | 361.0M | not specified | 2024-11-07 | eng-Latn
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
  title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
  author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
  year={2024},
  eprint={2411.04997},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2411.04997}
}

microsoft/LLM2CLIP-Openai-L-14-224

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
not specified | 1280 | 578.0M | not specified | 2024-11-07 | eng-Latn
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
  title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
  author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
  year={2024},
  eprint={2411.04997},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2411.04997}
}

microsoft/LLM2CLIP-Openai-L-14-336

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
not specified | 1280 | 579.0M | not specified | 2024-11-07 | eng-Latn
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
  title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
  author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
  year={2024},
  eprint={2411.04997},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2411.04997}
}

nomic-ai/colnomic-embed-multimodal-3b

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 128 | 3.0B | 7.0 GB | 2025-03-31 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
@misc{nomicembedmultimodal2025,
  title={Nomic Embed Multimodal: Interleaved Text, Image, and Screenshots for Visual Document Retrieval},
  author={Nomic Team},
  year={2025},
  publisher={Nomic AI},
  url={https://nomic.ai/blog/posts/nomic-embed-multimodal}
}

nomic-ai/colnomic-embed-multimodal-7b

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 128 | 7.0B | 14.1 GB | 2025-03-31 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
@misc{nomicembedmultimodal2025,
  title={Nomic Embed Multimodal: Interleaved Text, Image, and Screenshots for Visual Document Retrieval},
  author={Nomic Team},
  year={2025},
  publisher={Nomic AI},
  url={https://nomic.ai/blog/posts/nomic-embed-multimodal}
}

nomic-ai/nomic-embed-vision-v1.5

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
2.0K | 768 | 92.9M | 355.0 MB | 2024-06-08 | eng-Latn
Citation
@article{nussbaum2024nomicembedvision,
      title={Nomic Embed Vision: Expanding the Latent Space},
      author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy},
      journal={arXiv preprint arXiv:2406.18587},
      year={2024},
      eprint={2406.18587},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2406.18587}
}

nvidia/llama-nemoretriever-colembed-1b-v1

License: https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
8.2K | 2048 | 2.4B | 9.0 GB | 2025-06-27 | eng-Latn
Citation
@misc{xu2025llamanemoretrievercolembedtopperforming,
      title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
      author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
      year={2025},
      eprint={2507.05513},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2507.05513}
}

nvidia/llama-nemoretriever-colembed-3b-v1

License: https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
8.2K | 3072 | 4.4B | 16.4 GB | 2025-06-27 | eng-Latn
Citation
@misc{xu2025llamanemoretrievercolembedtopperforming,
      title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
      author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
      year={2025},
      eprint={2507.05513},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2507.05513}
}

royokong/e5-v

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
8.2K | 4096 | 8.4B | 15.6 GB | 2024-07-17 | eng-Latn
Citation
@article{jiang2024e5v,
      title={E5-V: Universal Embeddings with Multimodal Large Language Models},
      author={Jiang, Ting and Song, Minghui and Zhang, Zihan and Huang, Haizhen and Deng, Weiwei and Sun, Feng and Zhang, Qi and Wang, Deqing and Zhuang, Fuzhen},
      journal={arXiv preprint arXiv:2407.12580},
      year={2024},
      eprint={2407.12580},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2407.12580}
}

vidore/colSmol-256M

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
8.2K | 128 | 256.0M | 800.0 MB | 2025-01-22 | eng-Latn
Citation
@misc{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colSmol-500M

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
8.2K | 128 | 500.0M | 1.2 GB | 2025-01-22 | eng-Latn
Citation
@misc{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colpali-v1.1

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
16.4K | 128 | 2.9B | 4.6 GB | 2024-08-21 | eng-Latn
Citation
@misc{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colpali-v1.2

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
16.4K | 128 | 2.9B | 4.6 GB | 2024-08-26 | eng-Latn
Citation
@misc{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colpali-v1.3

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
16.4K | 128 | 2.9B | 4.6 GB | 2024-11-01 | eng-Latn
Citation
@misc{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colqwen2-v1.0

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
32.8K | 128 | 2.2B | 7.0 GB | 2025-11-03 | eng-Latn
Citation
@misc{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colqwen2.5-v0.2

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 128 | 3.0B | 7.0 GB | 2025-01-31 | eng-Latn
Citation
@misc{faysse2024colpali,
  title={ColPali: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}
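
The ColPali family above (colpali, colSmol, colqwen) emits multi-vector embeddings scored with late interaction (MaxSim) rather than a single cosine similarity. A minimal retrieval sketch, assuming the colpali-engine package is installed; the query string and placeholder image are illustrative only.

import torch
from PIL import Image
from colpali_engine.models import ColPali, ColPaliProcessor

model_name = "vidore/colpali-v1.3"
model = ColPali.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
processor = ColPaliProcessor.from_pretrained(model_name)

images = [Image.new("RGB", (448, 448), "white")]  # placeholder page image
queries = ["What was the revenue in 2024?"]       # illustrative query

batch_images = processor.process_images(images).to(model.device)
batch_queries = processor.process_queries(queries).to(model.device)

with torch.no_grad():
    image_embeddings = model(**batch_images)   # one vector per image patch
    query_embeddings = model(**batch_queries)  # one vector per query token

# Late-interaction scoring: MaxSim between query tokens and page patches.
scores = processor.score_multi_vector(query_embeddings, image_embeddings)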

Non-instruction Models

BAAI/bge-visualized-base

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 196.0M | 1.6 GB | 2024-06-06 | eng-Latn
Citation
@article{zhou2024vista,
  title={VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval},
  author={Zhou, Junjie and Liu, Zheng and Xiao, Shitao and Zhao, Bo and Xiong, Yongping},
  journal={arXiv preprint arXiv:2406.04292},
  year={2024}
}

BAAI/bge-visualized-m3

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
8.2K | 1024 | 872.9M | 4.2 GB | 2024-06-06 | eng-Latn
Citation
@article{zhou2024vista,
  title={VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval},
  author={Zhou, Junjie and Liu, Zheng and Xiao, Shitao and Zhao, Bo and Xiong, Yongping},
  journal={arXiv preprint arXiv:2406.04292},
  year={2024}
}

Cohere/Cohere-embed-v4.0

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 1536 | not specified | not specified | 2024-12-01 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)

Cohere/Cohere-embed-v4.0 (output_dtype=binary)

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 1536 | not specified | not specified | 2024-12-01 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)

Cohere/Cohere-embed-v4.0 (output_dtype=int8)

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
128.0K | 1536 | not specified | not specified | 2024-12-01 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)
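
The three embed-v4.0 rows above differ only in output_dtype, which the Cohere API exposes through the embedding_types parameter. A hedged sketch with the official cohere Python SDK; the attribute names follow the SDK docs as recalled here and are assumptions.

import cohere

co = cohere.ClientV2(api_key="...")  # your Cohere API key

response = co.embed(
    model="embed-v4.0",
    input_type="search_document",
    texts=["A sample passage to index."],
    embedding_types=["float", "int8", "binary"],  # mirrors the output_dtype variants above
)

# Each requested dtype is returned under its own attribute.
print(len(response.embeddings.float_[0]))  # 1536-dim float vector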

QuanSun/EVA02-CLIP-B-16

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 512 | 149.0M | 568.0 MB | 2023-04-26 | eng-Latn
Citation
@article{EVA-CLIP,
      title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
      author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
      journal={arXiv preprint arXiv:2303.15389},
      year={2023}
}

QuanSun/EVA02-CLIP-L-14

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 768 | 428.0M | 1.6 GB | 2023-04-26 | eng-Latn
Citation
@article{EVA-CLIP,
      title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
      author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
      journal={arXiv preprint arXiv:2303.15389},
      year={2023}
}

QuanSun/EVA02-CLIP-bigE-14

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 1024 | 4.7B | 17.5 GB | 2023-04-26 | eng-Latn
Citation
@article{EVA-CLIP,
      title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
      author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
      journal={arXiv preprint arXiv:2303.15389},
      year={2023}
}

QuanSun/EVA02-CLIP-bigE-14-plus

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 1024 | 5.0B | 18.6 GB | 2023-04-26 | eng-Latn
Citation
@article{EVA-CLIP,
      title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
      author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
      journal={arXiv preprint arXiv:2303.15389},
      year={2023}
}

Salesforce/blip-image-captioning-base

License: bsd-3-clause

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 247.0M | 942.0 MB | 2023-08-01 | eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-image-captioning-large

License: bsd-3-clause

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 470.0M | 1.8 GB | 2023-12-07 | eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-itm-base-coco

License: bsd-3-clause

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 247.0M | 942.0 MB | 2023-08-01 | eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}
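
The BLIP ITM (image-text matching) checkpoints score how well a caption matches an image. A minimal sketch via Transformers' BlipForImageTextRetrieval; the image path is a placeholder.

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForImageTextRetrieval

model_name = "Salesforce/blip-itm-base-coco"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForImageTextRetrieval.from_pretrained(model_name)

image = Image.open("example.jpg")  # placeholder path
inputs = processor(images=image, text="a photo of a cat", return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# itm_score holds match/no-match logits; index 1 is the "matched" class.
match_prob = torch.softmax(outputs.itm_score, dim=1)[:, 1]
print(match_prob.item())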

Salesforce/blip-itm-base-flickr

License: bsd-3-clause

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 247.0M | 942.0 MB | 2023-08-01 | eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-itm-large-coco

License: bsd-3-clause

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 470.0M | 1.8 GB | 2023-08-01 | eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-itm-large-flickr

License: bsd-3-clause

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 470.0M | 1.8 GB | 2023-08-01 | eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-vqa-base

License: bsd-3-clause

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 247.0M | 1.4 GB | 2023-12-07 | eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-vqa-capfilt-large

License: bsd-3-clause

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
512 | 768 | 247.0M | 942.0 MB | 2023-01-22 | eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip2-opt-2.7b

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
not specified | 768 | 3.7B | 14.0 GB | 2024-03-22 | eng-Latn
Citation
@inproceedings{li2023blip2,
    title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
    author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
    year={2023},
    booktitle={ICML},
}

Salesforce/blip2-opt-6.7b-coco

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
not specified | 768 | 7.8B | 28.9 GB | 2024-03-31 | eng-Latn
Citation
@inproceedings{li2023blip2,
    title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
    author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
    year={2023},
    booktitle={ICML},
}

baseline/random-cross-encoder-baseline

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
inf | 32 | 0 | 0.0 MB | not specified | not specified

baseline/random-encoder-baseline

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
inf | 32 | 0 | 0.0 MB | not specified | not specified

cohere/embed-english-v3.0

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
not specified | 1024 | not specified | not specified | 2024-10-24 | eng-Latn

cohere/embed-multilingual-v3.0

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
not specified | 1024 | not specified | not specified | 2024-10-24 | not specified

google/siglip-base-patch16-224

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 768 | 203.0M | 775.0 MB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
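
SigLIP replaces CLIP's batch-softmax objective with a per-pair sigmoid, so each image-text pair gets an independent match probability. A minimal sketch via Transformers; the fixed-length padding follows the model card, and the image path is a placeholder.

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model_name = "google/siglip-base-patch16-224"
model = AutoModel.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

image = Image.open("example.jpg")  # placeholder path
texts = ["a photo of a cat", "a photo of a dog"]

# SigLIP was trained with fixed-length text, hence padding="max_length".
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Sigmoid, not softmax: probabilities are independent per pair.
probs = torch.sigmoid(outputs.logits_per_image)
print(probs)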

google/siglip-base-patch16-256

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 768 | 203.0M | 775.0 MB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-base-patch16-256-multilingual

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 768 | 371.0M | 1.4 GB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-base-patch16-384

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 768 | 203.0M | 776.0 MB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-base-patch16-512

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 768 | 204.0M | 777.0 MB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-large-patch16-256

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 1024 | 652.0M | 2.4 GB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-large-patch16-384

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 1024 | 652.0M | 2.4 GB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-so400m-patch14-224

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
16 | 1152 | 877.0M | 3.3 GB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-so400m-patch14-384

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 1152 | 878.0M | 3.3 GB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-so400m-patch16-256-i18n

License: apache-2.0

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 1152 | 1.1B | 4.2 GB | 2024-01-08 | eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

kakaobrain/align-base

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
64 | 768 | 176.0M | 671.0 MB | 2023-02-24 | eng-Latn
Citation
@misc{kakaobrain2022coyo-align,
    title         = {COYO-ALIGN},
    author        = {Yoon, Boogeo and Lee, Youhan and Baek, Woonhyuk},
    year          = {2022},
    howpublished  = {https://github.com/kakaobrain/coyo-align},
}

laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 512 | 150.0M | 572.0 MB | 2023-04-26 | eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}
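
These LAION checkpoints (and the others in this group) load directly through OpenCLIP's hf-hub: prefix. A minimal sketch, assuming the open_clip_torch package is installed; the image path is a placeholder.

import torch
import open_clip
from PIL import Image

tag = "hf-hub:laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K"
model, _, preprocess = open_clip.create_model_and_transforms(tag)
tokenizer = open_clip.get_tokenizer(tag)

image = preprocess(Image.open("example.jpg")).unsqueeze(0)  # placeholder path
text = tokenizer(["a cat", "a dog"])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # After L2 normalization, cosine similarity is a plain dot product.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = image_features @ text_features.T
print(similarity)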

laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 512 | 151.0M | 576.0 MB | 2023-04-26 | eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-B-32-laion2B-s34B-b79K

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 512 | 151.0M | 577.0 MB | 2022-09-15 | eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-H-14-laion2B-s32B-b79K

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 1024 | 986.0M | 3.7 GB | 2022-09-15 | eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 768 | 428.0M | 1.6 GB | 2023-04-26 | eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-L-14-laion2B-s32B-b82K

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 768 | 428.0M | 1.6 GB | 2022-09-15 | eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-bigG-14-laion2B-39B-b160k

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 1280 | 2.5B | 9.5 GB | 2023-01-23 | eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-g-14-laion2B-s34B-b88K

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 1024 | 1.4B | 5.1 GB | 2023-03-06 | eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

openai/clip-vit-base-patch16

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 512 | 151.0M | 576.0 MB | 2021-02-26 | eng-Latn
Citation
@article{radford2021learning,
  title={Learning Transferable Visual Models From Natural Language Supervision},
  author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
  journal={arXiv preprint arXiv:2103.00020},
  year={2021}
}
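
The original OpenAI CLIP checkpoints are served natively by Transformers; a softmax over logits_per_image across candidate captions gives zero-shot classification scores. A minimal sketch with a placeholder image path.

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model_name = "openai/clip-vit-base-patch16"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

image = Image.open("example.jpg")  # placeholder path
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    outputs = model(**inputs)

# Contrastive scores: softmax over the candidate captions for each image.
probs = outputs.logits_per_image.softmax(dim=1)
print(probs)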

openai/clip-vit-base-patch32

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 512 | 151.0M | 576.0 MB | 2021-02-26 | eng-Latn
Citation
@article{radford2021learning,
  title={Learning Transferable Visual Models From Natural Language Supervision},
  author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
  journal={arXiv preprint arXiv:2103.00020},
  year={2021}
}

openai/clip-vit-large-patch14

License: not specified

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
77 | 768 | 428.0M | 1.6 GB | 2021-02-26 | eng-Latn
Citation
@article{radford2021learning,
  title={Learning Transferable Visual Models From Natural Language Supervision},
  author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
  journal={arXiv preprint arXiv:2103.00020},
  year={2021}
}

voyageai/voyage-multimodal-3

License: mit

Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages
32.8K | 1024 | not specified | not specified | 2024-11-10 | not specified
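
voyage-multimodal-3 is API-only. A hedged sketch with the voyageai Python SDK's multimodal embeddings endpoint; the method name and result fields follow the SDK docs as recalled here and should be verified, the image path is a placeholder, and the key is read from VOYAGE_API_KEY.

import voyageai
from PIL import Image

vo = voyageai.Client()  # reads VOYAGE_API_KEY from the environment

# Each input interleaves strings and PIL images in one list.
inputs = [["A caption describing the chart.", Image.open("chart.png")]]  # placeholder

result = vo.multimodal_embed(
    inputs=inputs,
    model="voyage-multimodal-3",
    input_type="document",
)
print(len(result.embeddings[0]))  # 1024-dim vector per input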