Image-text Model¶
- Number of models: 46
Instruction Model¶
Alibaba-NLP/gme-Qwen2-VL-2B-Instruct¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 32.8K | 1536 | 2.2B | 8.2 GB | 2024-12-24 | cmn-Hans, eng-Latn |
Citation
@misc{zhang2024gme,
title={GME: Improving Universal Multimodal Retrieval by Multimodal LLMs},
author={Zhang, Xin and Zhang, Yanzhao and Xie, Wen and Li, Mingxin and Dai, Ziqi and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Li, Wenjie and Zhang, Min},
year={2024},
eprint={2412.16855},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={http://arxiv.org/abs/2412.16855}
}
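The entries on this page are reference metadata only. Assuming these model names are registered in the `mteb` Python package (whose documentation this listing follows), a minimal evaluation sketch could look like the following; the task name is a placeholder for illustration, not taken from this page.

```python
# Minimal sketch, assuming the model names on this page are registered in the `mteb` package.
import mteb

# Load a listed model by the name shown in its heading above.
model = mteb.get_model("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")

# "DocVQA" is a placeholder task name used only for illustration.
tasks = mteb.get_tasks(tasks=["DocVQA"])
results = mteb.MTEB(tasks=tasks).run(model, output_folder="results")
```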
Alibaba-NLP/gme-Qwen2-VL-7B-Instruct¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 32.8K | 3584 | 8.3B | 30.9 GB | 2024-12-24 | cmn-Hans, eng-Latn |
Citation
@misc{zhang2024gme,
title={GME: Improving Universal Multimodal Retrieval by Multimodal LLMs},
author={Zhang, Xin and Zhang, Yanzhao and Xie, Wen and Li, Mingxin and Dai, Ziqi and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Li, Wenjie and Zhang, Min},
year={2024},
eprint={2412.16855},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={http://arxiv.org/abs/2412.16855}
}
ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-3B-v1¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 128 | 3.0B | 7.0 GB | 2025-11-04 | eng-Latn |
ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-7B-v1¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 128 | 7.0B | 14.1 GB | 2025-11-04 | eng-Latn |
TIGER-Lab/VLM2Vec-Full¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 131.1K | 3072 | 4.2B | 7.7 GB | 2024-10-08 | eng-Latn |
Citation
@article{jiang2024vlm2vec,
title={VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
journal={arXiv preprint arXiv:2410.05160},
year={2024}
}
TIGER-Lab/VLM2Vec-LoRA¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 131.1K | 3072 | not specified | not specified | 2024-10-08 | eng-Latn |
Citation
@article{jiang2024vlm2vec,
title={VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
journal={arXiv preprint arXiv:2410.05160},
year={2024}
}
ibm-granite/granite-vision-3.3-2b-embedding¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 128 | 3.0B | 11.1 GB | 2025-06-11 | eng-Latn |
Citation
@article{karlinsky2025granitevision,
title={Granite Vision: a lightweight, open-source multimodal model for enterprise Intelligence},
author={Granite Vision Team and Karlinsky, Leonid and Arbelle, Assaf and Daniels, Abraham and Nassar, Ahmed and Alfassi, Amit and Wu, Bo and Schwartz, Eli and Joshi, Dhiraj and Kondic, Jovana and Shabtay, Nimrod and Li, Pengyuan and Herzig, Roei and Abedin, Shafiq and Perek, Shaked and Harary, Sivan and Barzelay, Udi and Raz Goldfarb, Adi and Oliva, Aude and Wieles, Ben and Bhattacharjee, Bishwaranjan and Huang, Brandon and Auer, Christoph and Gutfreund, Dan and Beymer, David and Wood, David and Kuehne, Hilde and Hansen, Jacob and Shtok, Joseph and Wong, Ken and Bathen, Luis Angel and Mishra, Mayank and Lysak, Maksym and Dolfi, Michele and Yurochkin, Mikhail and Livathinos, Nikolaos and Harel, Nimrod and Azulai, Ophir and Naparstek, Oshri and de Lima, Rafael Teixeira and Panda, Rameswar and Doveh, Sivan and Gupta, Shubham and Das, Subhro and Zawad, Syed and Kim, Yusik and He, Zexue and Brooks, Alexander and Goodhart, Gabe and Govindjee, Anita and Leist, Derek and Ibrahim, Ibrahim and Soffer, Aya and Cox, David and Soule, Kate and Lastras, Luis and Desai, Nirmit and Ofek-koifman, Shila and Raghavan, Sriram and Syeda-Mahmood, Tanveer and Staar, Peter and Drory, Tal and Feris, Rogerio},
journal={arXiv preprint arXiv:2502.09927},
year={2025}
}
intfloat/mmE5-mllama-11b-instruct¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 4096 | 10.6B | 19.8 GB | 2025-02-12 | eng-Latn |
Citation
@article{chen2025mmE5,
title={mmE5: Improving Multimodal Multilingual Embeddings via High-quality Synthetic Data},
author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
journal={arXiv preprint arXiv:2502.08468},
year={2025}
}
jinaai/jina-clip-v1¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 8.2K | 768 | 223.0M | 849.0 MB | 2024-05-30 | eng-Latn |
Citation
@article{koukounas2024jinaclip,
title={Jina CLIP: Your CLIP Model Is Also Your Text Retriever},
author={Koukounas, Andreas and Mastrapas, Georgios and Günther, Michael and Wang, Bo and Martens, Scott and Mohr, Isabelle and Sturua, Saba and Akram, Mohammad Kalim and Martínez, Joan Fontanals and Ognawala, Saahil and Guzman, Susana and Werk, Maximilian and Wang, Nan and Xiao, Han},
journal={arXiv preprint arXiv:2405.20204},
year={2024}
}
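A sketch of using the shared text-image space, assuming jina-clip-v1's custom code on the Hub exposes the `encode_text` / `encode_image` helpers described on its model card (the image URL is a placeholder):

```python
# Sketch assuming jina-clip-v1's remote code provides encode_text / encode_image.
from transformers import AutoModel

model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)

text_embeddings = model.encode_text(["a photo of a sunflower field"])
image_embeddings = model.encode_image(["https://example.com/sunflowers.jpg"])  # placeholder URL

# Both live in the same 768-dimensional space, so a dot product gives the match score.
score = (text_embeddings @ image_embeddings.T)[0, 0]
```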
jinaai/jina-embeddings-v4¶
License: cc-by-nc-4.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 32.8K | 2048 | 3.8B | 7.3 GB | 2025-06-24 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
Citation
@misc{günther2025jinaembeddingsv4universalembeddingsmultimodal,
title={jina-embeddings-v4: Universal Embeddings for Multimodal Multilingual Retrieval},
author={Michael Günther and Saba Sturua and Mohammad Kalim Akram and Isabelle Mohr and Andrei Ungureanu and Sedigheh Eslami and Scott Martens and Bo Wang and Nan Wang and Han Xiao},
year={2025},
eprint={2506.18902},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2506.18902},
}
microsoft/LLM2CLIP-Openai-B-16¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1280 | 361.0M | not specified | 2024-11-07 | eng-Latn |
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
year={2024},
eprint={2411.04997},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2411.04997}
}
microsoft/LLM2CLIP-Openai-L-14-224¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1280 | 578.0M | not specified | 2024-11-07 | eng-Latn |
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
year={2024},
eprint={2411.04997},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2411.04997}
}
microsoft/LLM2CLIP-Openai-L-14-336¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1280 | 579.0M | not specified | 2024-11-07 | eng-Latn |
Citation
@misc{huang2024llm2clippowerfullanguagemodel,
title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
year={2024},
eprint={2411.04997},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2411.04997}
}
nomic-ai/colnomic-embed-multimodal-3b¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 128 | 3.0B | 7.0 GB | 2025-03-31 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn |
Citation
@misc{nomicembedmultimodal2025,
title={Nomic Embed Multimodal: Interleaved Text, Image, and Screenshots for Visual Document Retrieval},
author={Nomic Team},
year={2025},
publisher={Nomic AI},
url={https://nomic.ai/blog/posts/nomic-embed-multimodal}
}
nomic-ai/colnomic-embed-multimodal-7b¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 128 | 7.0B | 14.1 GB | 2025-03-31 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn |
Citation
@misc{nomicembedmultimodal2025,
title={Nomic Embed Multimodal: Interleaved Text, Image, and Screenshots for Visual Document Retrieval},
author={Nomic Team},
year={2025},
publisher={Nomic AI},
url={https://nomic.ai/blog/posts/nomic-embed-multimodal}
}
nomic-ai/nomic-embed-vision-v1.5¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 2.0K | 768 | 92.9M | 355.0 MB | 2024-06-08 | eng-Latn |
Citation
@article{nussbaum2024nomicembedvision,
title={Nomic Embed Vision: Expanding the Latent Space},
author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy},
journal={arXiv preprint arXiv:2406.18587},
year={2024},
eprint={2406.18587},
archivePrefix={arXiv},
url={https://arxiv.org/abs/2406.18587}
}
nvidia/llama-nemoretriever-colembed-1b-v1¶
License: https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 8.2K | 2048 | 2.4B | 9.0 GB | 2025-06-27 | eng-Latn |
Citation
@misc{xu2025llamanemoretrievercolembedtopperforming,
title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
year={2025},
eprint={2507.05513},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2507.05513}
}
nvidia/llama-nemoretriever-colembed-3b-v1¶
License: https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 8.2K | 3072 | 4.4B | 16.4 GB | 2025-06-27 | eng-Latn |
Citation
@misc{xu2025llamanemoretrievercolembedtopperforming,
title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
year={2025},
eprint={2507.05513},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2507.05513}
}
royokong/e5-v¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 8.2K | 4096 | 8.4B | 15.6 GB | 2024-07-17 | eng-Latn |
Citation
@article{jiang2024e5v,
title={E5-V: Universal Embeddings with Multimodal Large Language Models},
author={Jiang, Ting and Song, Minghui and Zhang, Zihan and Huang, Haizhen and Deng, Weiwei and Sun, Feng and Zhang, Qi and Wang, Deqing and Zhuang, Fuzhen},
journal={arXiv preprint arXiv:2407.12580},
year={2024},
eprint={2407.12580},
archivePrefix={arXiv},
url={https://arxiv.org/abs/2407.12580}
}
vidore/colSmol-256M¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 8.2K | 128 | 256.0M | 800.0 MB | 2025-01-22 | eng-Latn |
Citation
@misc{faysse2024colpali,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
vidore/colSmol-500M¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 8.2K | 128 | 500.0M | 1.2 GB | 2025-01-22 | eng-Latn |
Citation
@misc{faysse2024colpali,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
vidore/colpali-v1.1¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 16.4K | 128 | 2.9B | 4.6 GB | 2024-08-21 | eng-Latn |
Citation
@misc{faysse2024colpali,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
vidore/colpali-v1.2¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 16.4K | 128 | 2.9B | 4.6 GB | 2024-08-26 | eng-Latn |
Citation
@misc{faysse2024colpali,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
vidore/colpali-v1.3¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 16.4K | 128 | 2.9B | 4.6 GB | 2024-11-01 | eng-Latn |
Citation
@misc{faysse2024colpali,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
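The 128-dimensional embedding listed for the ColPali/ColSmol/ColQwen family is the per-token vector used for late-interaction (MaxSim) scoring, not a single pooled vector. A usage sketch, assuming the `colpali-engine` package referenced on the vidore model cards:

```python
# Sketch assuming the colpali-engine package (pip install colpali-engine).
import torch
from PIL import Image
from colpali_engine.models import ColPali, ColPaliProcessor

model = ColPali.from_pretrained("vidore/colpali-v1.3").eval()
processor = ColPaliProcessor.from_pretrained("vidore/colpali-v1.3")

pages = [Image.new("RGB", (448, 448), "white")]        # stand-in document page
queries = ["Which year had the highest revenue?"]

batch_images = processor.process_images(pages)
batch_queries = processor.process_queries(queries)

with torch.no_grad():
    page_embeddings = model(**batch_images)    # shape: (pages, tokens, 128)
    query_embeddings = model(**batch_queries)  # shape: (queries, tokens, 128)

# Late-interaction relevance scores between every query and every page.
scores = processor.score_multi_vector(query_embeddings, page_embeddings)
```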
vidore/colqwen2-v1.0¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 32.8K | 128 | 2.2B | 7.0 GB | 2025-11-03 | eng-Latn |
Citation
@misc{faysse2024colpali,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
vidore/colqwen2.5-v0.2¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 128 | 3.0B | 7.0 GB | 2025-01-31 | eng-Latn |
Citation
@misc{faysse2024colpali,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Faysse, Manuel and Sibille, Hugues and Wu, Tony and Omrani, Bilel and Viaud, Gautier and Hudelot, Céline and Colombo, Pierre},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
Non-instruction Model¶
BAAI/bge-visualized-base¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 196.0M | 1.6 GB | 2024-06-06 | eng-Latn |
Citation
@article{zhou2024vista,
title={VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval},
author={Zhou, Junjie and Liu, Zheng and Xiao, Shitao and Zhao, Bo and Xiong, Yongping},
journal={arXiv preprint arXiv:2406.04292},
year={2024}
}
BAAI/bge-visualized-m3¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 8.2K | 1024 | 872.9M | 4.2 GB | 2024-06-06 | eng-Latn |
Citation
@article{zhou2024vista,
title={VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval},
author={Zhou, Junjie and Liu, Zheng and Xiao, Shitao and Zhao, Bo and Xiong, Yongping},
journal={arXiv preprint arXiv:2406.04292},
year={2024}
}
Cohere/Cohere-embed-v4.0¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 1536 | not specified | not specified | 2024-12-01 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111) |
Cohere/Cohere-embed-v4.0 (output_dtype=binary)¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 1536 | not specified | not specified | 2024-12-01 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111) |
Cohere/Cohere-embed-v4.0 (output_dtype=int8)¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 128.0K | 1536 | not specified | not specified | 2024-12-01 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111) |
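The binary and int8 rows above are the same model evaluated with compressed output types. A sketch of requesting them through the Cohere SDK, assuming the `embed-v4.0` model identifier and the `embedding_types` parameter; both should be checked against the provider's current documentation.

```python
# Sketch assuming the Cohere Python SDK; model id and parameter names are assumptions.
import cohere

co = cohere.Client()  # reads the API key from the CO_API_KEY environment variable

response = co.embed(
    texts=["a short passage to index"],
    model="embed-v4.0",
    input_type="search_document",
    embedding_types=["float", "int8", "binary"],  # corresponds to the output_dtype variants above
)
# response.embeddings then carries one list of vectors per requested type.
```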
QuanSun/EVA02-CLIP-B-16¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 512 | 149.0M | 568.0 MB | 2023-04-26 | eng-Latn |
Citation
@article{EVA-CLIP,
title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2303.15389},
year={2023}
}
QuanSun/EVA02-CLIP-L-14¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 768 | 428.0M | 1.6 GB | 2023-04-26 | eng-Latn |
Citation
@article{EVA-CLIP,
title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2303.15389},
year={2023}
}
QuanSun/EVA02-CLIP-bigE-14¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 1024 | 4.7B | 17.5 GB | 2023-04-26 | eng-Latn |
Citation
@article{EVA-CLIP,
title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2303.15389},
year={2023}
}
QuanSun/EVA02-CLIP-bigE-14-plus¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 1024 | 5.0B | 18.6 GB | 2023-04-26 | eng-Latn |
Citation
@article{EVA-CLIP,
title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2303.15389},
year={2023}
}
Salesforce/blip-image-captioning-base¶
License: bsd-3-clause
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 247.0M | 942.0 MB | 2023-08-01 | eng-Latn |
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
doi = {10.48550/ARXIV.2201.12086},
url = {https://arxiv.org/abs/2201.12086},
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
Salesforce/blip-image-captioning-large¶
License: bsd-3-clause
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 470.0M | 1.8 GB | 2023-12-07 | eng-Latn |
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
doi = {10.48550/ARXIV.2201.12086},
url = {https://arxiv.org/abs/2201.12086},
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
Salesforce/blip-itm-base-coco¶
License: bsd-3-clause
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 247.0M | 942.0 MB | 2023-08-01 | eng-Latn |
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
doi = {10.48550/ARXIV.2201.12086},
url = {https://arxiv.org/abs/2201.12086},
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
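For the BLIP ITM checkpoints, a scoring sketch assuming the `BlipForImageTextRetrieval` head in transformers; the image is a stand-in:

```python
# Sketch assuming transformers' BLIP image-text matching head.
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForImageTextRetrieval

processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco").eval()

image = Image.new("RGB", (384, 384), "white")  # stand-in image
inputs = processor(images=image, text="a photo of a cat", return_tensors="pt")

with torch.no_grad():
    itm_logits = model(**inputs).itm_score              # two-way match / no-match logits
    match_probability = torch.softmax(itm_logits, dim=-1)[:, 1]
```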
Salesforce/blip-itm-base-flickr¶
License: bsd-3-clause
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 247.0M | 942.0 MB | 2023-08-01 | eng-Latn |
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
doi = {10.48550/ARXIV.2201.12086},
url = {https://arxiv.org/abs/2201.12086},
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
Salesforce/blip-itm-large-coco¶
License: bsd-3-clause
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 470.0M | 1.8 GB | 2023-08-01 | eng-Latn |
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
doi = {10.48550/ARXIV.2201.12086},
url = {https://arxiv.org/abs/2201.12086},
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
Salesforce/blip-itm-large-flickr¶
License: bsd-3-clause
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 470.0M | 1.8 GB | 2023-08-01 | eng-Latn |
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
doi = {10.48550/ARXIV.2201.12086},
url = {https://arxiv.org/abs/2201.12086},
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
Salesforce/blip-vqa-base¶
License: bsd-3-clause
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 247.0M | 1.4 GB | 2023-12-07 | eng-Latn |
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
doi = {10.48550/ARXIV.2201.12086},
url = {https://arxiv.org/abs/2201.12086},
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
Salesforce/blip-vqa-capfilt-large¶
License: bsd-3-clause
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 512 | 768 | 247.0M | 942.0 MB | 2023-01-22 | eng-Latn |
Citation
@misc{https://doi.org/10.48550/arxiv.2201.12086,
doi = {10.48550/ARXIV.2201.12086},
url = {https://arxiv.org/abs/2201.12086},
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
Salesforce/blip2-opt-2.7b¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 3.7B | 14.0 GB | 2024-03-22 | eng-Latn |
Citation
@inproceedings{li2023blip2,
title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
year={2023},
booktitle={ICML},
}
Salesforce/blip2-opt-6.7b-coco¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 7.8B | 28.9 GB | 2024-03-31 | eng-Latn |
Citation
@inproceedings{li2023blip2,
title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
year={2023},
booktitle={ICML},
}
baseline/random-cross-encoder-baseline¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 32 | 0 | 0.0 MB | not specified | not specified |
baseline/random-encoder-baseline¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 32 | 0 | 0.0 MB | not specified | not specified |
cohere/embed-english-v3.0¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | not specified | not specified | 2024-10-24 | eng-Latn |
cohere/embed-multilingual-v3.0¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | not specified | not specified | 2024-10-24 | not specified |
google/siglip-base-patch16-224¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 768 | 203.0M | 775.0 MB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
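A zero-shot scoring sketch for the SigLIP checkpoints, assuming the transformers API; note the short 64-token text limit listed above, which is why the processor pads to the maximum length:

```python
# Sketch assuming transformers' SigLIP support.
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model = AutoModel.from_pretrained("google/siglip-base-patch16-224").eval()
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

image = Image.new("RGB", (224, 224), "white")  # stand-in image
texts = ["a photo of a cat", "a photo of a dog"]
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# SigLIP uses a sigmoid objective, so each image-text pair is scored independently.
probabilities = torch.sigmoid(outputs.logits_per_image)
```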
google/siglip-base-patch16-256¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 768 | 203.0M | 775.0 MB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
google/siglip-base-patch16-256-multilingual¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 768 | 371.0M | 1.4 GB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
google/siglip-base-patch16-384¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 768 | 203.0M | 776.0 MB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
google/siglip-base-patch16-512¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 768 | 204.0M | 777.0 MB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
google/siglip-large-patch16-256¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 1024 | 652.0M | 2.4 GB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
google/siglip-large-patch16-384¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 1024 | 652.0M | 2.4 GB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
google/siglip-so400m-patch14-224¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 16 | 1152 | 877.0M | 3.3 GB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
google/siglip-so400m-patch14-384¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 1152 | 878.0M | 3.3 GB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
google/siglip-so400m-patch16-256-i18n¶
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 1152 | 1.1B | 4.2 GB | 2024-01-08 | eng-Latn |
Citation
@misc{zhai2023sigmoid,
title={Sigmoid Loss for Language Image Pre-Training},
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
year={2023},
eprint={2303.15343},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
kakaobrain/align-base¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 64 | 768 | 176.0M | 671.0 MB | 2023-02-24 | eng-Latn |
Citation
@misc{kakaobrain2022coyo-align,
title = {COYO-ALIGN},
author = {Yoon, Boogeo and Lee, Youhan and Baek, Woonhyuk},
year = {2022},
howpublished = {https://github.com/kakaobrain/coyo-align},
}
laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 512 | 150.0M | 572.0 MB | 2023-04-26 | eng-Latn |
Citation
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={2818--2829},
year={2023}
}
laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 512 | 151.0M | 576.0 MB | 2023-04-26 | eng-Latn |
Citation
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={2818--2829},
year={2023}
}
laion/CLIP-ViT-B-32-laion2B-s34B-b79K¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 512 | 151.0M | 577.0 MB | 2022-09-15 | eng-Latn |
Citation
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={2818--2829},
year={2023}
}
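The LAION checkpoints can also be loaded straight from the Hub with `open_clip`; a sketch, assuming its `hf-hub:` loading syntax:

```python
# Sketch assuming the open_clip package and its hf-hub: loading syntax.
import torch
import open_clip
from PIL import Image

checkpoint = "hf-hub:laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
model, _, preprocess = open_clip.create_model_and_transforms(checkpoint)
tokenizer = open_clip.get_tokenizer(checkpoint)

image = preprocess(Image.new("RGB", (224, 224), "white")).unsqueeze(0)  # stand-in image
text = tokenizer(["a diagram", "a dog", "a cat"])

with torch.no_grad():
    image_features = model.encode_image(image)  # 512-dim, matching the table above
    text_features = model.encode_text(text)
```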
laion/CLIP-ViT-H-14-laion2B-s32B-b79K¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 1024 | 986.0M | 3.7 GB | 2022-09-15 | eng-Latn |
Citation
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={2818--2829},
year={2023}
}
laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 768 | 428.0M | 1.6 GB | 2023-04-26 | eng-Latn |
Citation
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={2818--2829},
year={2023}
}
laion/CLIP-ViT-L-14-laion2B-s32B-b82K¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 768 | 428.0M | 1.6 GB | 2022-09-15 | eng-Latn |
Citation
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={2818--2829},
year={2023}
}
laion/CLIP-ViT-bigG-14-laion2B-39B-b160k¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 1280 | 2.5B | 9.5 GB | 2023-01-23 | eng-Latn |
Citation
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={2818--2829},
year={2023}
}
laion/CLIP-ViT-g-14-laion2B-s34B-b88K¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 1024 | 1.4B | 5.1 GB | 2023-03-06 | eng-Latn |
Citation
@inproceedings{cherti2023reproducible,
title={Reproducible scaling laws for contrastive language-image learning},
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={2818--2829},
year={2023}
}
openai/clip-vit-base-patch16¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 512 | 151.0M | 576.0 MB | 2021-02-26 | eng-Latn |
Citation
@article{radford2021learning,
title={Learning Transferable Visual Models From Natural Language Supervision},
author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
journal={arXiv preprint arXiv:2103.00020},
year={2021}
}
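For the original OpenAI CLIP checkpoints, a sketch of extracting the projected text and image features with transformers:

```python
# Sketch of extracting CLIP's projected embeddings via transformers.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

image = Image.new("RGB", (224, 224), "white")  # stand-in image
inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    text_features = model.get_text_features(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
    )
    image_features = model.get_image_features(pixel_values=inputs["pixel_values"])

# Both projections are 512-dimensional, as listed in the table above.
similarity = torch.nn.functional.cosine_similarity(text_features, image_features)
```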
openai/clip-vit-base-patch32¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 512 | 151.0M | 576.0 MB | 2021-02-26 | eng-Latn |
Citation
@article{radford2021learning,
title={Learning Transferable Visual Models From Natural Language Supervision},
author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
journal={arXiv preprint arXiv:2103.00020},
year={2021}
}
openai/clip-vit-large-patch14¶
License: not specified
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 77 | 768 | 428.0M | 1.6 GB | 2021-02-26 | eng-Latn |
Citation
@article{radford2021learning,
title={Learning Transferable Visual Models From Natural Language Supervision},
author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
journal={arXiv preprint arXiv:2103.00020},
year={2021}
}
voyageai/voyage-multimodal-3¶
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| 32.8K | 1024 | not specified | not specified | 2024-11-10 | not specified |
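voyage-multimodal-3 is served through an API rather than open weights. A sketch assuming the voyageai Python client exposes a multimodal embedding helper; the method name and arguments are assumptions and should be verified against the provider documentation.

```python
# Sketch assuming the voyageai Python client exposes multimodal_embed;
# verify the exact method name and arguments against the provider documentation.
import voyageai
from PIL import Image

vo = voyageai.Client()  # reads the API key from the VOYAGE_API_KEY environment variable

page = Image.new("RGB", (512, 512), "white")  # stand-in image
result = vo.multimodal_embed(
    inputs=[["quarterly revenue chart", page]],  # interleaved text + image input
    model="voyage-multimodal-3",
    input_type="document",
)
embedding = result.embeddings[0]  # 1024-dimensional, as listed above
```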