Skip to content

Image-text Model

50 Models

Instruction Model

Alibaba-NLP/gme-Qwen2-VL-2B-Instruct

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.2B 1536 32.8K 8.2 GB 2024-12-24 cmn-Hans, eng-Latn
Citation
@misc{zhang2024gme,
      title={{GME}: Improving Universal Multimodal Retrieval by Multimodal {LLMs}},
      author={Zhang, Xin and Zhang, Yanzhao and Xie, Wen and Li, Mingxin and Dai, Ziqi and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Li, Wenjie and Zhang, Min},
      year={2024},
      eprint={2412.16855},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2412.16855}
}

Alibaba-NLP/gme-Qwen2-VL-7B-Instruct

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
7.7B 3584 32.8K 30.9 GB 2024-12-24 cmn-Hans, eng-Latn
Citation
@misc{zhang2024gme,
      title={{GME}: Improving Universal Multimodal Retrieval by Multimodal {LLMs}},
      author={Zhang, Xin and Zhang, Yanzhao and Xie, Wen and Li, Mingxin and Dai, Ziqi and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Li, Wenjie and Zhang, Min},
      year={2024},
      eprint={2412.16855},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2412.16855}
}

ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-3B-v1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
3.0B 128 128.0K 7.0 GB 2025-11-04 eng-Latn

ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-7B-v1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
7.0B 128 128.0K 14.1 GB 2025-11-04 eng-Latn

Bytedance/Seed1.6-embedding

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
not specified 2048 32.8K not specified 2025-06-18 eng-Latn, zho-Hans

OpenSearch-AI/Ops-Colqwen3-4B

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.8B 2560 32.8K 9.0 GB 2026-01-24 afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73)
Citation
@misc{ops_colqwen3_4b,
  author       = {{OpenSearch-AI}},
  title        = {{Ops-ColQwen3}: State-of-the-Art Multimodal Embedding Model for Visual Document Retrieval},
  year         = {2026},
  url          = {https://huggingface.co/OpenSearch-AI/Ops-ColQwen3-4B},
}

Qwen/Qwen3-VL-Embedding-2B

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.1B 2048 32.8K 7.5 GB 2026-01-08 eng-Latn
Citation
@article{qwen3vlembedding,
  title={{Qwen3-VL-Embedding} and {Qwen3-VL-Reranker}: A Unified Framework for State-of-the-Art Multimodal Retrieval and Ranking},
  author={Li, Mingxin and Zhang, Yanzhao and Long, Dingkun and Chen, Keqin and Song, Sibo and Bai, Shuai and Yang, Zhibo and Xie, Pengjun and Yang, An and Liu, Dayiheng and Zhou, Jingren and Lin, Junyang},
  journal={arXiv preprint arXiv:2601.04720},
  year={2026}
}

Qwen/Qwen3-VL-Embedding-8B

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
8.1B 4096 32.8K 29.8 GB 2026-01-08 eng-Latn
Citation
@article{qwen3vlembedding,
  title={{Qwen3-VL-Embedding} and {Qwen3-VL-Reranker}: A Unified Framework for State-of-the-Art Multimodal Retrieval and Ranking},
  author={Li, Mingxin and Zhang, Yanzhao and Long, Dingkun and Chen, Keqin and Song, Sibo and Bai, Shuai and Yang, Zhibo and Xie, Pengjun and Yang, An and Liu, Dayiheng and Zhou, Jingren and Lin, Junyang},
  journal={arXiv preprint arXiv:2601.04720},
  year={2026}
}

TIGER-Lab/VLM2Vec-Full

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.1B 3072 131.1K 7.7 GB 2024-10-08 eng-Latn
Citation
@article{jiang2024vlm2vec,
  title={{VLM2Vec}: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
  author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.05160},
  year={2024}
}

TIGER-Lab/VLM2Vec-LoRA

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.2B 3072 131.1K not specified 2024-10-08 eng-Latn
Citation
@article{jiang2024vlm2vec,
  title={{VLM2Vec}: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
  author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.05160},
  year={2024}
}

TomoroAI/tomoro-colqwen3-embed-4b

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.0B 320 262.1K 8.3 GB 2025-11-26 eng-Latn
Citation
@misc{huang2025tomoro_colqwen3_embed,
  title={{TomoroAI/tomoro-colqwen3-embed}},
  author={Huang, Xin and Tan, Kye Min and Phelps, Albert},
  year={2025},
  url={https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-8b}
}

TomoroAI/tomoro-colqwen3-embed-8b

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
8.0B 320 262.1K 16.3 GB 2025-11-26 eng-Latn
Citation
@misc{huang2025tomoro_colqwen3_embed,
  title={{TomoroAI/tomoro-colqwen3-embed}},
  author={Huang, Xin and Tan, Kye Min and Phelps, Albert},
  year={2025},
  url={https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-8b}
}

VAGOsolutions/SauerkrautLM-ColLFM2-450M-v0.1

License: https://huggingface.co/LiquidAI/LFM2-VL-450M/blob/main/LICENSE • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
451.0M 128 32.8K 860.0 MB 2025-12-20 deu-Latn, eng-Latn, fra-Latn, ita-Latn, por-Latn, ... (6)
Citation
@misc{sauerkrautlm-colpali-2025,
  title={{SauerkrautLM-ColPali}: Multi-Vector Vision Retrieval Models},
  author={Golchinfar, David},
  organization={{VAGO Solutions}},
  year={2025},
  url={https://github.com/VAGOsolutions/sauerkrautlm-colpali}
}

@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

VAGOsolutions/SauerkrautLM-ColMinistral3-3b-v0.1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.3B 128 262.1K 7.9 GB 2025-12-20 deu-Latn, eng-Latn, fra-Latn, ita-Latn, por-Latn, ... (6)
Citation
@misc{sauerkrautlm-colpali-2025,
  title={{SauerkrautLM-ColPali}: Multi-Vector Vision Retrieval Models},
  author={Golchinfar, David},
  organization={{VAGO Solutions}},
  year={2025},
  url={https://github.com/VAGOsolutions/sauerkrautlm-colpali}
}

@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

VAGOsolutions/SauerkrautLM-ColQwen3-1.7b-Turbo-v0.1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.8B 128 262.1K 3.3 GB 2025-12-20 deu-Latn, eng-Latn, fra-Latn, ita-Latn, por-Latn, ... (6)
Citation
@misc{sauerkrautlm-colpali-2025,
  title={{SauerkrautLM-ColPali}: Multi-Vector Vision Retrieval Models},
  author={Golchinfar, David},
  organization={{VAGO Solutions}},
  year={2025},
  url={https://github.com/VAGOsolutions/sauerkrautlm-colpali}
}

@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

VAGOsolutions/SauerkrautLM-ColQwen3-2b-v0.1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.1B 128 262.1K 4.0 GB 2025-12-20 deu-Latn, eng-Latn, fra-Latn, ita-Latn, por-Latn, ... (6)
Citation
@misc{sauerkrautlm-colpali-2025,
  title={{SauerkrautLM-ColPali}: Multi-Vector Vision Retrieval Models},
  author={Golchinfar, David},
  organization={{VAGO Solutions}},
  year={2025},
  url={https://github.com/VAGOsolutions/sauerkrautlm-colpali}
}

@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

VAGOsolutions/SauerkrautLM-ColQwen3-4b-v0.1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.4B 128 262.1K 8.3 GB 2025-12-20 deu-Latn, eng-Latn, fra-Latn, ita-Latn, por-Latn, ... (6)
Citation
@misc{sauerkrautlm-colpali-2025,
  title={{SauerkrautLM-ColPali}: Multi-Vector Vision Retrieval Models},
  author={Golchinfar, David},
  organization={{VAGO Solutions}},
  year={2025},
  url={https://github.com/VAGOsolutions/sauerkrautlm-colpali}
}

@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

VAGOsolutions/SauerkrautLM-ColQwen3-8b-v0.1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
8.1B 128 262.1K 15.2 GB 2025-12-20 deu-Latn, eng-Latn, fra-Latn, ita-Latn, por-Latn, ... (6)
Citation
@misc{sauerkrautlm-colpali-2025,
  title={{SauerkrautLM-ColPali}: Multi-Vector Vision Retrieval Models},
  author={Golchinfar, David},
  organization={{VAGO Solutions}},
  year={2025},
  url={https://github.com/VAGOsolutions/sauerkrautlm-colpali}
}

@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

eagerworks/eager-embed-v1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.4B 2560 262.1K 16.5 GB 2025-11-20 deu-Latn, eng-Latn, fra-Latn, spa-Latn
Citation
@misc{EagerEmbed,
  title={Eager Embed {V1}: Multimodal Dense Embeddings for Retrieval},
  author={Balarini, Juan Pablo},
  year={2025},
  publisher={Eagerworks},
  url={https://github.com/eagerworks/eager-embed}
}

ibm-granite/granite-vision-3.3-2b-embedding

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
3.0B 128 128.0K 11.1 GB 2025-06-11 eng-Latn
Citation
@article{karlinsky2025granitevision,
  title={{Granite Vision}: a lightweight, open-source multimodal model for enterprise Intelligence},
  author={{Granite Vision Team} and Karlinsky, Leonid and Arbelle, Assaf and Daniels, Abraham and Nassar, Ahmed and Alfassi, Amit and Wu, Bo and Schwartz, Eli and Joshi, Dhiraj and Kondic, Jovana and Shabtay, Nimrod and Li, Pengyuan and Herzig, Roei and Abedin, Shafiq and Perek, Shaked and Harary, Sivan and Barzelay, Udi and Raz Goldfarb, Adi and Oliva, Aude and Wieles, Ben and Bhattacharjee, Bishwaranjan and Huang, Brandon and Auer, Christoph and Gutfreund, Dan and Beymer, David and Wood, David and Kuehne, Hilde and Hansen, Jacob and Shtok, Joseph and Wong, Ken and Bathen, Luis Angel and Mishra, Mayank and Lysak, Maksym and Dolfi, Michele and Yurochkin, Mikhail and Livathinos, Nikolaos and Harel, Nimrod and Azulai, Ophir and Naparstek, Oshri and de Lima, Rafael Teixeira and Panda, Rameswar and Doveh, Sivan and Gupta, Shubham and Das, Subhro and Zawad, Syed and Kim, Yusik and He, Zexue and Brooks, Alexander and Goodhart, Gabe and Govindjee, Anita and Leist, Derek and Ibrahim, Ibrahim and Soffer, Aya and Cox, David and Soule, Kate and Lastras, Luis and Desai, Nirmit and Ofek-koifman, Shila and Raghavan, Sriram and Syeda-Mahmood, Tanveer and Staar, Peter and Drory, Tal and Feris, Rogerio},
  journal={arXiv preprint arXiv:2502.09927},
  year={2025}
}

intfloat/mmE5-mllama-11b-instruct

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
10.6B 4096 128.0K 19.8 GB 2025-02-12 eng-Latn
Citation
@article{chen2025mmE5,
  title={{mmE5}: Improving Multimodal Multilingual Embeddings via High-quality Synthetic Data},
  author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
  journal={arXiv preprint arXiv:2502.08468},
  year={2025}
}

jinaai/jina-clip-v1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
222.6M 768 8.2K 849.0 MB 2024-05-30 eng-Latn
Citation
@article{koukounas2024jinaclip,
  title={{Jina CLIP}: Your {CLIP} Model Is Also Your Text Retriever},
  author={Koukounas, Andreas and Mastrapas, Georgios and Günther, Michael and Wang, Bo and Martens, Scott and Mohr, Isabelle and Sturua, Saba and Akram, Mohammad Kalim and Martínez, Joan Fontanals and Ognawala, Saahil and Guzman, Susana and Werk, Maximilian and Wang, Nan and Xiao, Han},
  journal={arXiv preprint arXiv:2405.20204},
  year={2024}
}

jinaai/jina-embeddings-v4

License: cc-by-nc-4.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
3.9B 2048 32.8K 7.3 GB 2025-06-24 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@misc{günther2025jinaembeddingsv4universalembeddingsmultimodal,
      title={{jina-embeddings-v4}: Universal Embeddings for Multimodal Multilingual Retrieval},
      author={Günther, Michael and Sturua, Saba and Akram, Mohammad Kalim and Mohr, Isabelle and Ungureanu, Andrei and Eslami, Sedigheh and Martens, Scott and Wang, Bo and Wang, Nan and Xiao, Han},
      year={2025},
      eprint={2506.18902},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2506.18902},
}

microsoft/LLM2CLIP-Openai-B-16

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
360.6M 1280 not specified not specified 2024-11-07 eng-Latn
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
  title={{LLM2CLIP}: Powerful Language Model Unlock Richer Visual Representation},
  author={Huang, Weiquan and Wu, Aoqi and Yang, Yifan and Luo, Xufang and Yang, Yuqing and Hu, Liang and Dai, Qi and Dai, Xiyang and Chen, Dongdong and Luo, Chong and Qiu, Lili},
  year={2024},
  eprint={2411.04997},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2411.04997}
}

microsoft/LLM2CLIP-Openai-L-14-224

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
578.0M 1280 not specified not specified 2024-11-07 eng-Latn
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
  title={{LLM2CLIP}: Powerful Language Model Unlock Richer Visual Representation},
  author={Huang, Weiquan and Wu, Aoqi and Yang, Yifan and Luo, Xufang and Yang, Yuqing and Hu, Liang and Dai, Qi and Dai, Xiyang and Chen, Dongdong and Luo, Chong and Qiu, Lili},
  year={2024},
  eprint={2411.04997},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2411.04997}
}

microsoft/LLM2CLIP-Openai-L-14-336

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
578.6M 1280 not specified not specified 2024-11-07 eng-Latn
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
  title={{LLM2CLIP}: Powerful Language Model Unlock Richer Visual Representation},
  author={Huang, Weiquan and Wu, Aoqi and Yang, Yifan and Luo, Xufang and Yang, Yuqing and Hu, Liang and Dai, Qi and Dai, Xiyang and Chen, Dongdong and Luo, Chong and Qiu, Lili},
  year={2024},
  eprint={2411.04997},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2411.04997}
}

nomic-ai/colnomic-embed-multimodal-3b

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
3.0B 128 128.0K 7.0 GB 2025-03-31 deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
@misc{nomicembedmultimodal2025,
  title={{Nomic Embed Multimodal}: Interleaved Text, Image, and Screenshots for Visual Document Retrieval},
  author={{Nomic Team}},
  year={2025},
  publisher={{Nomic AI}},
  url={https://nomic.ai/blog/posts/nomic-embed-multimodal}
}

nomic-ai/colnomic-embed-multimodal-7b

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
7.0B 128 128.0K 14.1 GB 2025-03-31 deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
@misc{nomicembedmultimodal2025,
  title={{Nomic Embed Multimodal}: Interleaved Text, Image, and Screenshots for Visual Document Retrieval},
  author={{Nomic Team}},
  year={2025},
  publisher={{Nomic AI}},
  url={https://nomic.ai/blog/posts/nomic-embed-multimodal}
}

nomic-ai/nomic-embed-multimodal-3b

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
3.8B 128 128.0K 6.1 GB 2025-04-15 deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
@misc{nomicembedmultimodal2025,
  title={{Nomic Embed Multimodal}: Interleaved Text, Image, and Screenshots for Visual Document Retrieval},
  author={{Nomic Team}},
  year={2025},
  publisher={{Nomic AI}},
  url={https://www.nomic.ai/news/nomic-embed-multimodal}
}

nomic-ai/nomic-embed-multimodal-7b

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
7.8B 128 128.0K 14.1 GB 2025-04-15 deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn
Citation
@misc{nomicembedmultimodal2025,
  title={{Nomic Embed Multimodal}: Interleaved Text, Image, and Screenshots for Visual Document Retrieval},
  author={{Nomic Team}},
  year={2025},
  publisher={{Nomic AI}},
  url={https://www.nomic.ai/news/nomic-embed-multimodal}
}

nomic-ai/nomic-embed-vision-v1.5

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
92.9M 768 2.0K 355.0 MB 2024-06-08 eng-Latn
Citation
@article{nussbaum2024nomicembedvision,
      title={{Nomic Embed Vision}: Expanding the Latent Space},
      author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy},
      journal={arXiv preprint arXiv:2406.18587},
      year={2024},
      eprint={2406.18587},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2406.18587}
}

nvidia/llama-nemoretriever-colembed-1b-v1

License: https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.4B 2048 8.2K 4.5 GB 2025-06-27 eng-Latn
Citation
@misc{xu2025llamanemoretrievercolembedtopperforming,
      title={{Llama Nemoretriever Colembed}: Top-Performing Text-Image Retrieval Model},
      author={Xu, Mengyao and Moreira, Gabriel and Ak, Ronay and Osmulski, Radek and Babakhin, Yauhen and Yu, Zhiding and Schifferer, Benedikt and Oldridge, Even},
      year={2025},
      eprint={2507.05513},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2507.05513}
}

nvidia/llama-nemoretriever-colembed-3b-v1

License: https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.4B 3072 8.2K 8.2 GB 2025-06-27 eng-Latn
Citation
@misc{xu2025llamanemoretrievercolembedtopperforming,
      title={{Llama Nemoretriever Colembed}: Top-Performing Text-Image Retrieval Model},
      author={Xu, Mengyao and Moreira, Gabriel and Ak, Ronay and Osmulski, Radek and Babakhin, Yauhen and Yu, Zhiding and Schifferer, Benedikt and Oldridge, Even},
      year={2025},
      eprint={2507.05513},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2507.05513}
}

nvidia/llama-nemotron-colembed-vl-3b-v2

License: https://huggingface.co/nvidia/llama-nemotron-colembed-vl-3b-v2/blob/main/LICENSE • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.4B 3072 8.2K 8.2 GB 2026-01-21 eng-Latn
Citation
@misc{moreira2026nemotroncolembedv2topperforming,
    title={{Nemotron ColEmbed V2}: Top-Performing Late Interaction embedding models for Visual Document Retrieval},
    author={Moreira, Gabriel de Souza P. and Ak, Ronay and Xu, Mengyao and Holworthy, Oliver and Schifferer, Benedikt and Yu, Zhiding and Babakhin, Yauhen and Osmulski, Radek and Cai, Jiarui and Chesler, Ryan and Liu, Bo and Oldridge, Even},
    year={2026},
    eprint={2602.03992},
    archivePrefix={arXiv},
    primaryClass={cs.IR},
    url={https://arxiv.org/abs/2602.03992},
}

nvidia/llama-nemotron-embed-vl-1b-v2

License: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/ • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.7B 2048 10.2K 6.3 GB 2026-01-06 eng-Latn
Citation
@misc{ronay2026smallyetmighty,
    title={Small Yet Mighty: Improve Accuracy In Multimodal Search and Visual Document Retrieval with {Llama Nemotron} {RAG} Models},
    author={Ak, Ronay and Moreira, Gabriel de Souza Pereira and Liu, Bo},
    year={2026},
    url={https://huggingface.co/blog/nvidia/llama-nemotron-vl-1b},
}

nvidia/nemotron-colembed-vl-4b-v2

License: https://huggingface.co/nvidia/nemotron-colembed-vl-4b-v2/blob/main/LICENSE • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.8B 2560 262.1K 9.0 GB 2026-01-07 eng-Latn
Citation
@misc{moreira2026nemotroncolembedv2topperforming,
    title={{Nemotron ColEmbed V2}: Top-Performing Late Interaction embedding models for Visual Document Retrieval},
    author={Moreira, Gabriel de Souza P. and Ak, Ronay and Xu, Mengyao and Holworthy, Oliver and Schifferer, Benedikt and Yu, Zhiding and Babakhin, Yauhen and Osmulski, Radek and Cai, Jiarui and Chesler, Ryan and Liu, Bo and Oldridge, Even},
    year={2026},
    eprint={2602.03992},
    archivePrefix={arXiv},
    primaryClass={cs.IR},
    url={https://arxiv.org/abs/2602.03992},
}

nvidia/nemotron-colembed-vl-8b-v2

License: https://huggingface.co/nvidia/nemotron-colembed-vl-8b-v2/blob/main/LICENSE • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
8.7B 4096 262.1K 16.3 GB 2026-01-07 eng-Latn
Citation
@misc{moreira2026nemotroncolembedv2topperforming,
    title={{Nemotron ColEmbed V2}: Top-Performing Late Interaction embedding models for Visual Document Retrieval},
    author={Moreira, Gabriel de Souza P. and Ak, Ronay and Xu, Mengyao and Holworthy, Oliver and Schifferer, Benedikt and Yu, Zhiding and Babakhin, Yauhen and Osmulski, Radek and Cai, Jiarui and Chesler, Ryan and Liu, Bo and Oldridge, Even},
    year={2026},
    eprint={2602.03992},
    archivePrefix={arXiv},
    primaryClass={cs.IR},
    url={https://arxiv.org/abs/2602.03992},
}

royokong/e5-v

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
8.4B 4096 8.2K 15.6 GB 2024-07-17 eng-Latn
Citation
@article{jiang2024e5v,
      title={{E5-V}: Universal Embeddings with Multimodal Large Language Models},
      author={Jiang, Ting and Song, Minghui and Zhang, Zihan and Huang, Haizhen and Deng, Weiwei and Sun, Feng and Zhang, Qi and Wang, Deqing and Zhuang, Fuzhen},
      journal={arXiv preprint arXiv:2407.12580},
      year={2024},
      eprint={2407.12580},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2407.12580}
}

vidore/colSmol-256M

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
256.0M 128 8.2K 800.0 MB 2025-01-22 eng-Latn
Citation
@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colSmol-500M

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
500.0M 128 8.2K 1.2 GB 2025-01-22 eng-Latn
Citation
@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colpali-v1.1

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.9B 128 16.4K 4.6 GB 2024-08-21 eng-Latn
Citation
@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colpali-v1.2

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.9B 128 16.4K 4.6 GB 2024-08-26 eng-Latn
Citation
@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colpali-v1.3

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.9B 128 16.4K 4.6 GB 2024-11-01 eng-Latn
Citation
@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colqwen2-v1.0

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.2B 128 32.8K 7.0 GB 2025-11-03 eng-Latn
Citation
@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

vidore/colqwen2.5-v0.2

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
3.0B 128 128.0K 7.0 GB 2025-01-31 eng-Latn
Citation
@misc{faysse2024colpali,
  title={{ColPali}: Efficient Document Retrieval with Vision Language Models},
  author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, C{\'e}line and Colombo, Pierre},
  year={2024},
  eprint={2407.01449},
  archivePrefix={arXiv},
  primaryClass={cs.IR}
}

Non-instruction Model

BAAI/bge-visualized-base

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
196.0M 768 512 1.6 GB 2024-06-06 eng-Latn
Citation
@article{zhou2024vista,
  title={{VISTA}: Visualized Text Embedding For Universal Multi-Modal Retrieval},
  author={Zhou, Junjie and Liu, Zheng and Xiao, Shitao and Zhao, Bo and Xiong, Yongping},
  journal={arXiv preprint arXiv:2406.04292},
  year={2024}
}

BAAI/bge-visualized-m3

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
872.9M 1024 8.2K 4.2 GB 2024-06-06 eng-Latn
Citation
@article{zhou2024vista,
  title={{VISTA}: Visualized Text Embedding For Universal Multi-Modal Retrieval},
  author={Zhou, Junjie and Liu, Zheng and Xiao, Shitao and Zhao, Bo and Xiong, Yongping},
  journal={arXiv preprint arXiv:2406.04292},
  year={2024}
}

Cohere/Cohere-embed-v4.0

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
not specified 1536 128.0K not specified 2024-12-01 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)

Cohere/Cohere-embed-v4.0 (output_dtype=binary)

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
not specified 1536 128.0K not specified 2024-12-01 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)

Cohere/Cohere-embed-v4.0 (output_dtype=int8)

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
not specified 1536 128.0K not specified 2024-12-01 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111)

QuanSun/EVA02-CLIP-B-16

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
149.0M 512 77 568.0 MB 2023-04-26 eng-Latn
Citation
@article{EVA-CLIP,
      title={{EVA-CLIP}: Improved Training Techniques for {CLIP} at Scale},
      author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
      journal={arXiv preprint arXiv:2303.15389},
      year={2023}
}

QuanSun/EVA02-CLIP-L-14

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
428.0M 768 77 1.6 GB 2023-04-26 eng-Latn
Citation
@article{EVA-CLIP,
      title={{EVA-CLIP}: Improved Training Techniques for {CLIP} at Scale},
      author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
      journal={arXiv preprint arXiv:2303.15389},
      year={2023}
}

QuanSun/EVA02-CLIP-bigE-14

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.7B 1024 77 17.5 GB 2023-04-26 eng-Latn
Citation
@article{EVA-CLIP,
      title={{EVA-CLIP}: Improved Training Techniques for {CLIP} at Scale},
      author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
      journal={arXiv preprint arXiv:2303.15389},
      year={2023}
}

QuanSun/EVA02-CLIP-bigE-14-plus

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
5.0B 1024 77 18.6 GB 2023-04-26 eng-Latn
Citation
@article{EVA-CLIP,
      title={{EVA-CLIP}: Improved Training Techniques for {CLIP} at Scale},
      author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
      journal={arXiv preprint arXiv:2303.15389},
      year={2023}
}

Salesforce/blip-image-captioning-base

License: bsd-3-clause • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
224.7M 768 512 942.0 MB 2023-08-01 eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
    title = {{BLIP}: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-image-captioning-large

License: bsd-3-clause • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
446.1M 768 512 1.8 GB 2023-12-07 eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
    title = {{BLIP}: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-itm-base-coco

License: bsd-3-clause • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
223.7M 768 512 942.0 MB 2023-08-01 eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-itm-base-flickr

License: bsd-3-clause • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
223.7M 768 512 942.0 MB 2023-08-01 eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-itm-large-coco

License: bsd-3-clause • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
446.1M 768 512 1.8 GB 2023-08-01 eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-itm-large-flickr

License: bsd-3-clause • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
446.1M 768 512 1.8 GB 2023-08-01 eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-vqa-base

License: bsd-3-clause • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
384.7M 768 512 1.4 GB 2023-12-07 eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip-vqa-capfilt-large

License: bsd-3-clause • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
384.7M 768 512 942.0 MB 2023-01-22 eng-Latn
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
    doi = {10.48550/ARXIV.2201.12086},
    url = {https://arxiv.org/abs/2201.12086},
    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution 4.0 International}
}

Salesforce/blip2-opt-2.7b

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
3.7B 768 not specified 14.0 GB 2024-03-22 eng-Latn
Citation
@inproceedings{li2023blip2,
    title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
    author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
    year={2023},
    booktitle={ICML},
}

Salesforce/blip2-opt-6.7b-coco

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
7.8B 768 not specified 28.9 GB 2024-03-31 eng-Latn
Citation
@inproceedings{li2023blip2,
    title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
    author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
    year={2023},
    booktitle={ICML},
}

athrael-soju/colqwen3.5-4.5B-v3

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.6B 128 262.1K 8.5 GB 2026-03-15 eng-Latn

cohere/embed-english-v3.0

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
not specified 1024 not specified not specified 2024-10-24 eng-Latn

cohere/embed-multilingual-v3.0

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
not specified 1024 not specified not specified 2024-10-24 not specified

facebook/metaclip-2-mt5-worldwide-b32

License: cc-by-nc-4.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
254.0M 512 77 969.0 MB 2025-11-12 afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99)
Citation
@article{xu2025metaclip2,
  title={{MetaCLIP} 2: A Worldwide Scaling Recipe},
  author={Xu, Hu and Xie, Saining and Ghosh, Gargi and Kira, Zsolt and Darrell, Trevor},
  journal={arXiv preprint arXiv:2507.22062},
  year={2025}
}

google/siglip-base-patch16-224

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
203.2M 768 64 775.0 MB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-base-patch16-256

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
203.2M 768 64 775.0 MB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-base-patch16-256-multilingual

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
370.6M 768 64 1.4 GB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-base-patch16-384

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
203.4M 768 64 776.0 MB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-base-patch16-512

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
203.8M 768 64 777.0 MB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-large-patch16-256

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
652.2M 1024 64 2.4 GB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-large-patch16-384

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
652.5M 1024 64 2.4 GB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-so400m-patch14-224

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
877.4M 1152 16 3.3 GB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-so400m-patch14-384

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
878.0M 1152 64 3.3 GB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

google/siglip-so400m-patch16-256-i18n

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.1B 1152 64 4.2 GB 2024-01-08 eng-Latn
Citation
@misc{zhai2023sigmoid,
      title={Sigmoid Loss for Language Image Pre-Training},
      author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
      year={2023},
      eprint={2303.15343},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

jinaai/jina-clip-v2

License: cc-by-nc-4.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
865.3M 1024 8.2K 1.6 GB 2024-10-09 eng-Latn
Citation
@misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
      title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
      author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
      year={2024},
      eprint={2412.08802},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2412.08802},
}

kakaobrain/align-base

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
172.1M 768 64 671.0 MB 2023-02-24 eng-Latn
Citation
@misc{kakaobrain2022coyo-align,
    title         = {COYO-ALIGN},
    author        = {Yoon, Boogeon and Lee, Youhan and Baek, Woonhyuk},
    year          = {2022},
    howpublished  = {\url{https://github.com/kakaobrain/coyo-align}},
}

laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
150.0M 512 77 572.0 MB 2023-04-26 eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
151.0M 512 77 576.0 MB 2023-04-26 eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-B-32-laion2B-s34B-b79K

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
151.3M 512 77 577.0 MB 2022-09-15 eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-H-14-laion2B-s32B-b79K

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
986.1M 1024 77 3.7 GB 2022-09-15 eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
427.6M 768 77 1.6 GB 2023-04-26 eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-L-14-laion2B-s32B-b82K

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
427.6M 768 77 1.6 GB 2022-09-15 eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-bigG-14-laion2B-39B-b160k

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.5B 1280 77 9.5 GB 2023-01-23 eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

laion/CLIP-ViT-g-14-laion2B-s34B-b88K

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.4B 1024 77 5.1 GB 2023-03-06 eng-Latn
Citation
@inproceedings{cherti2023reproducible,
    title={Reproducible scaling laws for contrastive language-image learning},
    author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={2818--2829},
    year={2023}
}

nanovdr/NanoVDR-S-Multi

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
69.0M 2048 512 282.0 MB 2026-02-26 deu-Latn, eng-Latn, fra-Latn, ita-Latn, por-Latn, ... (6)
Citation
@article{nanovdr2026,
  title={NanoVDR: Distilling a 2B Vision-Language Retriever into a 70M Text-Only Encoder for Visual Document Retrieval},
  author={Liu, Zhuchenyang and Zhang, Yao and Xiao, Yu},
  journal={arXiv preprint arXiv:2603.12824},
  year={2026}
}

openai/clip-vit-base-patch16

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
149.6M 512 77 576.0 MB 2021-02-26 eng-Latn
Citation
@article{radford2021learning,
  title={Learning Transferable Visual Models From Natural Language Supervision},
  author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
  journal={arXiv preprint arXiv:2103.00020},
  year={2021}
}

openai/clip-vit-base-patch32

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
151.3M 512 77 576.0 MB 2021-02-26 eng-Latn
Citation
@article{radford2021learning,
  title={Learning Transferable Visual Models From Natural Language Supervision},
  author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
  journal={arXiv preprint arXiv:2103.00020},
  year={2021}
}

openai/clip-vit-large-patch14

License: not specified • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
427.6M 768 77 1.6 GB 2021-02-26 eng-Latn
Citation
@article{radford2021learning,
  title={Learning Transferable Visual Models From Natural Language Supervision},
  author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
  journal={arXiv preprint arXiv:2103.00020},
  year={2021}
}

voyageai/voyage-multimodal-3

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
not specified 1024 32.8K not specified 2024-11-10 not specified

webAI-Official/webAI-ColVec1-4b

License: multiple • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.5B 640 262.1K 8.5 GB 2026-04-05 eng-Latn, fra-Latn
Citation
@misc{webAI-ColVec1,
  title={webAI-ColVec1: Late-Interaction Multi-Vector Embedding Model for Visual Document Retrieval},
  author={webAI},
  year={2026},
  url={https://huggingface.co/webAI-Official/webAI-ColVec1-4b}
}

webAI-Official/webAI-ColVec1-9b

License: multiple • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
9.4B 2560 262.1K 17.5 GB 2026-04-05 eng-Latn, fra-Latn
Citation
@misc{webAI-ColVec1,
  title={webAI-ColVec1: Late-Interaction Multi-Vector Embedding Model for Visual Document Retrieval},
  author={webAI},
  year={2026},
  url={https://huggingface.co/webAI-Official/webAI-ColVec1-4b}
}