Text Model¶
258 Models
Instruction Model¶
Alibaba-NLP/gte-Qwen1.5-7B-instruct¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 28.8 GB | 2024-04-20 | eng-Latn |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
Alibaba-NLP/gte-Qwen2-1.5B-instruct¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.5B | 8960 | 32.8K | 6.6 GB | 2024-07-29 | eng-Latn |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
Alibaba-NLP/gte-Qwen2-7B-instruct¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 3584 | 32.8K | 28.4 GB | 2024-06-15 | not specified |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
BAAI/bge-base-en¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.0M | 768 | 512 | 390.0 MB | 2023-08-05 | eng-Latn |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-base-en-v1.5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.0M | 768 | 512 | 390.0 MB | 2023-09-11 | eng-Latn |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-base-zh¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.0M | 768 | 512 | 390.0 MB | 2023-08-05 | zho-Hans |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-base-zh-v1.5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.0M | 768 | 512 | 416.0 MB | 2023-09-11 | zho-Hans |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-large-en¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.0M | 1024 | 512 | 1.2 GB | 2023-08-05 | eng-Latn |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-large-en-v1.5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.0M | 1024 | 512 | 1.2 GB | 2023-09-12 | eng-Latn |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-large-zh¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.0M | 1024 | 512 | 1.2 GB | 2023-08-02 | zho-Hans |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-large-zh-v1.5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.0M | 1024 | 512 | 1.2 GB | 2023-09-12 | zho-Hans |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-small-en¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 512 | 512 | 127.0 MB | 2023-08-05 | eng-Latn |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-small-en-v1.5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 512 | 512 | 127.0 MB | 2023-09-12 | eng-Latn |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-small-zh¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 512 | 512 | 127.0 MB | 2023-08-05 | zho-Hans |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-small-zh-v1.5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 512 | 512 | 91.0 MB | 2023-09-12 | zho-Hans |
Citation
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BMRetriever/BMRetriever-1B¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 908.8M | 2048 | 2.0K | 3.4 GB | 2024-04-29 | eng-Latn |
Citation
@inproceedings{xu-etal-2024-bmretriever,
title = "{BMR}etriever: Tuning Large Language Models as Better Biomedical Text Retrievers",
author = "Xu, Ran and Shi, Wenqi and Yu, Yue and Zhuang, Yuchen and Zhu, Yanqiao and Wang, May Dongmei and Ho, Joyce C. and Zhang, Chao and Yang, Carl",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
pages = "22234--22254",
url = "https://aclanthology.org/2024.emnlp-main.1241/"
}
BMRetriever/BMRetriever-2B¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 2.5B | 2048 | 8.2K | 9.3 GB | 2024-04-29 | eng-Latn |
Citation
@inproceedings{xu-etal-2024-bmretriever,
title = "{BMR}etriever: Tuning Large Language Models as Better Biomedical Text Retrievers",
author = "Xu, Ran and Shi, Wenqi and Yu, Yue and Zhuang, Yuchen and Zhu, Yanqiao and Wang, May Dongmei and Ho, Joyce C. and Zhang, Chao and Yang, Carl",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
pages = "22234--22254",
url = "https://aclanthology.org/2024.emnlp-main.1241/"
}
BMRetriever/BMRetriever-410M¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 353.8M | 1024 | 2.0K | 1.3 GB | 2024-04-29 | eng-Latn |
Citation
@inproceedings{xu-etal-2024-bmretriever,
title = "{BMR}etriever: Tuning Large Language Models as Better Biomedical Text Retrievers",
author = "Xu, Ran and Shi, Wenqi and Yu, Yue and Zhuang, Yuchen and Zhu, Yanqiao and Wang, May Dongmei and Ho, Joyce C. and Zhang, Chao and Yang, Carl",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
pages = "22234--22254",
url = "https://aclanthology.org/2024.emnlp-main.1241/"
}
BMRetriever/BMRetriever-7B¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 26.5 GB | 2024-04-29 | eng-Latn |
Citation
@inproceedings{xu-etal-2024-bmretriever,
title = "{BMR}etriever: Tuning Large Language Models as Better Biomedical Text Retrievers",
author = "Xu, Ran and Shi, Wenqi and Yu, Yue and Zhuang, Yuchen and Zhu, Yanqiao and Wang, May Dongmei and Ho, Joyce C. and Zhang, Chao and Yang, Carl",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
pages = "22234--22254",
url = "https://aclanthology.org/2024.emnlp-main.1241/"
}
BeastyZ/e5-R-mistral-7b¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.2B | 4096 | 32.8K | 27.0 GB | 2024-06-28 | eng-Latn |
ByteDance-Seed/Seed1.5-Embedding¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 2048 | 32.8K | not specified | 2025-04-25 | eng-Latn, zho-Hans |
Bytedance/Seed1.6-embedding-1215¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 2048 | 32.8K | not specified | 2025-12-15 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Cohere/Cohere-embed-english-light-v3.0¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 384 | 512 | not specified | 2023-11-02 | eng-Latn |
Cohere/Cohere-embed-english-v3.0¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 512 | not specified | 2023-11-02 | eng-Latn |
Cohere/Cohere-embed-multilingual-light-v3.0¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 384 | 512 | not specified | 2023-11-02 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111) |
Cohere/Cohere-embed-multilingual-v3.0¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 512 | not specified | not specified | 2023-11-02 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111) |
GeoGPT-Research-Project/GeoEmbedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 27.0 GB | 2025-04-22 | eng-Latn |
GritLM/GritLM-7B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.2B | 4096 | 32.8K | 13.5 GB | 2024-02-15 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn |
Citation
@misc{muennighoff2024generative,
title={Generative Representational Instruction Tuning},
author={Niklas Muennighoff and Hongjin Su and Liang Wang and Nan Yang and Furu Wei and Tao Yu and Amanpreet Singh and Douwe Kiela},
year={2024},
eprint={2402.09906},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
GritLM/GritLM-8x7B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 57.9B | 32768 | 32.8K | 87.0 GB | 2024-02-15 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn |
Citation
@misc{muennighoff2024generative,
title={Generative Representational Instruction Tuning},
author={Niklas Muennighoff and Hongjin Su and Liang Wang and Nan Yang and Furu Wei and Tao Yu and Amanpreet Singh and Douwe Kiela},
year={2024},
eprint={2402.09906},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 494.0M | 896 | 512 | 1.8 GB | 2024-10-23 | eng-Latn, zho-Hans |
Citation
@misc{hu2025kalmembedding,
title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
year={2025},
eprint={2501.01028},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.01028},
}
HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 494.0M | 896 | 512 | 1.8 GB | 2024-12-26 | eng-Latn, zho-Hans |
Citation
@misc{hu2025kalmembedding,
title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
year={2025},
eprint={2501.01028},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.01028},
}
HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 494.0M | 896 | 512 | 942.0 MB | 2025-06-25 | eng-Latn, zho-Hans |
Citation
@misc{hu2025kalmembedding,
title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
year={2025},
eprint={2501.01028},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.01028},
}
ICT-TIME-and-Querit/BOOM_4B_v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 32.8K | 7.5 GB | 2026-01-31 | ara-Arab, ben-Beng, deu-Latn, eng-Latn, fas-Arab, ... (18) |
IEITYuan/Yuan-embedding-2.0-en¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 595.8M | 1024 | 2.0K | 2.2 GB | 2025-11-27 | eng-Latn |
KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 494.0M | 896 | 512 | 1.8 GB | 2025-09-30 | eng-Latn, zho-Hans |
Citation
@misc{zhao2025kalmembeddingv2,
title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
year={2025},
eprint={2506.20923},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2506.20923},
}
@misc{hu2025kalmembedding,
title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
year={2025},
eprint={2501.01028},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.01028},
}
Kingsoft-LLM/QZhou-Embedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 3584 | 8.2K | 14.1 GB | 2025-08-24 | eng-Latn, zho-Hans |
Citation
@misc{yu2025qzhouembeddingtechnicalreport,
title={QZhou-Embedding Technical Report},
author={Peng Yu and En Xu and Bin Chen and Haibiao Chen and Yinfei Xu},
year={2025},
eprint={2508.21632},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2508.21632},
}
Kingsoft-LLM/QZhou-Embedding-Zh¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.6B | 1792 | 8.2K | 28.7 GB | 2025-09-28 | zho-Hans |
Citation
@misc{yu2025qzhouembeddingtechnicalreport,
title={QZhou-Embedding Technical Report},
author={Peng Yu and En Xu and Bin Chen and Haibiao Chen and Yinfei Xu},
year={2025},
eprint={2508.21632},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2508.21632},
}
Linq-AI-Research/Linq-Embed-Mistral¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 13.2 GB | 2024-05-29 | eng-Latn |
Citation
@misc{LinqAIResearch2024,
title={Linq-Embed-Mistral: Elevating Text Retrieval with Improved GPT Data Through Task-Specific Control and Quality Refinement},
author={Junseong Kim and Seolhwa Lee and Jihoon Kwon and Sangmo Gu and Yejin Kim and Minkyung Cho and Jy-yong Sohn and Chanyeol Choi},
howpublished={Linq AI Research Blog},
year={2024},
url={https://getlinq.com/blog/linq-embed-mistral/}
}
ManiacLabs/miniac-embed¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 1024 | 512 | 127.0 MB | 2026-02-19 | eng-Latn |
McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 6.6B | 4096 | 32.8K | 26.5 GB | 2024-04-09 | eng-Latn |
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
year={2024},
eprint={2404.05961},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05961},
}
McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 6.6B | 4096 | 32.8K | 26.5 GB | 2024-04-09 | eng-Latn |
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
year={2024},
eprint={2404.05961},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05961},
}
McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.5B | 4096 | 8.2K | 28.0 GB | 2024-04-09 | eng-Latn |
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
year={2024},
eprint={2404.05961},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05961},
}
McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.5B | 4096 | 8.2K | 28.0 GB | 2024-04-09 | eng-Latn |
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
year={2024},
eprint={2404.05961},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05961},
}
McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.2B | 4096 | 32.8K | 26.5 GB | 2024-04-09 | eng-Latn |
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
year={2024},
eprint={2404.05961},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05961},
}
McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.2B | 4096 | 32.8K | 26.5 GB | 2024-04-09 | eng-Latn |
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
year={2024},
eprint={2404.05961},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05961},
}
McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.3B | 4096 | 32.8K | 26.5 GB | 2024-04-09 | eng-Latn |
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
year={2024},
eprint={2404.05961},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05961},
}
McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.3B | 4096 | 32.8K | 26.5 GB | 2024-04-09 | eng-Latn |
Citation
@misc{behnamghader2024llm2veclargelanguagemodels,
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
author={Parishad BehnamGhader and Vaibhav Adlakha and Marius Mosbach and Dzmitry Bahdanau and Nicolas Chapados and Siva Reddy},
year={2024},
eprint={2404.05961},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.05961},
}
MongoDB/mdbr-leaf-ir¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 768 | 512 | 86.0 MB | 2025-08-27 | eng-Latn |
Citation
@misc{mdbr_leaf,
title={LEAF: Knowledge Distillation of Text Embedding Models with Teacher-Aligned Representations},
author={Robin Vujanic and Thomas Rueckstiess},
year={2025},
eprint={2509.12539},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2509.12539}
}
MongoDB/mdbr-leaf-mt¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 1024 | 512 | 86.0 MB | 2025-08-27 | eng-Latn |
Citation
@misc{mdbr_leaf,
title={LEAF: Knowledge Distillation of Text Embedding Models with Teacher-Aligned Representations},
author={Robin Vujanic and Thomas Rueckstiess},
year={2025},
eprint={2509.12539},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2509.12539}
}
NovaSearch/jasper_en_vision_language_v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.6B | 8960 | 131.1K | 3.7 GB | 2024-12-11 | eng-Latn |
Citation
@misc{zhang2025jasperstelladistillationsota,
title={Jasper and Stella: distillation of SOTA embedding models},
author={Dun Zhang and Jiacheng Li and Ziyang Zeng and Fulong Wang},
year={2025},
eprint={2412.19048},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2412.19048},
}
NovaSearch/stella_en_1.5B_v5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.5B | 8960 | 131.1K | 5.7 GB | 2024-07-12 | eng-Latn |
Citation
@misc{zhang2025jasperstelladistillationsota,
title={Jasper and Stella: distillation of SOTA embedding models},
author={Dun Zhang and Jiacheng Li and Ziyang Zeng and Fulong Wang},
year={2025},
eprint={2412.19048},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2412.19048},
}
NovaSearch/stella_en_400M_v5¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 435.0M | 4096 | 8.2K | 1.6 GB | 2024-07-12 | eng-Latn |
Citation
@misc{zhang2025jasperstelladistillationsota,
title={Jasper and Stella: distillation of SOTA embedding models},
author={Dun Zhang and Jiacheng Li and Ziyang Zeng and Fulong Wang},
year={2025},
eprint={2412.19048},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2412.19048},
}
Octen/Octen-Embedding-0.6B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 595.8M | 1024 | 32.8K | 1.1 GB | 2026-01-10 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{octen2025rteb,
title={Octen Series: Optimizing Embedding Models to \#1 on RTEB Leaderboard},
author={{Octen Team}},
year={2025},
url={https://octen-team.github.io/octen_blog/posts/octen-rteb-first-place/}
}
Octen/Octen-Embedding-4B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 32.8K | 7.5 GB | 2025-12-30 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{octen2025rteb,
title={Octen Series: Optimizing Embedding Models to \#1 on RTEB Leaderboard},
author={{Octen Team}},
year={2025},
url={https://octen-team.github.io/octen_blog/posts/octen-rteb-first-place/}
}
Octen/Octen-Embedding-4B-INT8¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 1024 | 32.8K | 3.7 GB | 2026-04-02 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{octen2025rteb,
title={Octen Series: Optimizing Embedding Models to \#1 on RTEB Leaderboard},
author={{Octen Team}},
year={2025},
url={https://octen-team.github.io/octen_blog/posts/octen-rteb-first-place/}
}
Octen/Octen-Embedding-8B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.6B | 4096 | 32.8K | 14.1 GB | 2025-12-23 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{octen2025rteb,
title={Octen Series: Optimizing Embedding Models to \#1 on RTEB Leaderboard},
author={{Octen Team}},
year={2025},
url={https://octen-team.github.io/octen_blog/posts/octen-rteb-first-place/}
}
Octen/Octen-Embedding-8B-INT8¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.6B | 1024 | 32.8K | 7.0 GB | 2026-01-10 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{octen2025rteb,
title={Octen Series: Optimizing Embedding Models to \#1 on RTEB Leaderboard},
author={{Octen Team}},
year={2025},
url={https://octen-team.github.io/octen_blog/posts/octen-rteb-first-place/}
}
Qwen/Qwen3-Embedding-0.6B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 595.8M | 1024 | 32.8K | 1.1 GB | 2025-06-05 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@article{qwen3embedding,
title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},
author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},
journal={arXiv preprint arXiv:2506.05176},
year={2025}
}
Qwen/Qwen3-Embedding-4B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 32.8K | 7.5 GB | 2025-06-05 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@article{qwen3embedding,
title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},
author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},
journal={arXiv preprint arXiv:2506.05176},
year={2025}
}
Qwen/Qwen3-Embedding-8B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.6B | 4096 | 32.8K | 14.1 GB | 2025-06-05 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@article{qwen3embedding,
  author  = {Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},
  title   = {Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},
  journal = {arXiv preprint arXiv:2506.05176},
  year    = {2025},
}
ReasonIR/ReasonIR-8B¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.5B | 4096 | 131.1K | not specified | 2025-04-29 | eng-Latn |
Citation
@article{shao2025reasonir,
  title   = {{ReasonIR}: Training Retrievers for Reasoning Tasks},
  author  = {Rulin Shao and Rui Qiao and Varsha Kishore and Niklas Muennighoff and Xi Victoria Lin and Daniela Rus and Bryan Kian Hsiang Low and Sewon Min and Wen-tau Yih and Pang Wei Koh and Luke Zettlemoyer},
  journal = {arXiv preprint arXiv:2504.20595},
  url     = {https://arxiv.org/abs/2504.20595},
  year    = {2025},
}
Sailesh97/Hinvec¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 939.6M | 2048 | 2.0K | 3.6 GB | 2025-06-19 | eng-Latn, hin-Deva |
Salesforce/SFR-Embedding-2_R¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 13.2 GB | 2024-06-14 | eng-Latn |
Citation
@misc{SFR-embedding-2,
  title  = {{SFR-Embedding-2}: Advanced Text Embedding with Multi-stage Training},
  author = {Meng, Rui and Liu, Ye and Joty, Shafiq Rayhan and Xiong, Caiming and Zhou, Yingbo and Yavuz, Semih},
  year   = {2024},
  url    = {https://huggingface.co/Salesforce/SFR-Embedding-2_R},
}
Salesforce/SFR-Embedding-Code-2B_R¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 2.6B | 2304 | 8.2K | 4.9 GB | 2025-01-17 | eng-Latn |
Citation
@article{liu2024codexembed,
  title   = {CodeXEmbed: A Generalist Embedding Model Family for Multilingual and Multi-task Code Retrieval},
  author  = {Liu, Ye and Meng, Rui and Joty, Shafiq and Savarese, Silvio and Xiong, Caiming and Zhou, Yingbo and Yavuz, Semih},
  journal = {arXiv preprint arXiv:2411.12644},
  year    = {2024},
}
Salesforce/SFR-Embedding-Mistral¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 13.2 GB | 2024-01-24 | eng-Latn |
Citation
@misc{SFRAIResearch2024,
  title        = {{SFR-Embedding-Mistral}: Enhance Text Retrieval with Transfer Learning},
  author       = {Meng, Rui and Liu, Ye and Joty, Shafiq Rayhan and Xiong, Caiming and Zhou, Yingbo and Yavuz, Semih},
  howpublished = {Salesforce AI Research Blog},
  year         = {2024},
  url          = {https://www.salesforce.com/blog/sfr-embedding/},
}
SamilPwC-AXNode-GenAI/PwC-Embedding_expr¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 559.9M | 1024 | 514 | 2.1 GB | 2025-08-12 | kor-Hang |
Snowflake/snowflake-arctic-embed-l¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.1M | 1024 | 512 | 1.2 GB | 2024-04-12 | eng-Latn |
Citation
@article{merrick2024embedding,
  author        = {Merrick, Luke},
  title         = {Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
  journal       = {arXiv preprint arXiv:2407.18887},
  eprint        = {2407.18887},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2407.18887},
  year          = {2024},
}
Snowflake/snowflake-arctic-embed-l-v2.0¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 567.8M | 1024 | 8.2K | 2.1 GB | 2024-12-04 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (74) |
Citation
@article{yu2024arctic,
  author        = {Yu, Puxuan and Merrick, Luke and Nuti, Gaurav and Campos, Daniel},
  title         = {Arctic-Embed 2.0: Multilingual Retrieval Without Compromise},
  journal       = {arXiv preprint arXiv:2412.04506},
  eprint        = {2412.04506},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2412.04506},
  year          = {2024},
}
Snowflake/snowflake-arctic-embed-m¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 512 | 415.0 MB | 2024-04-12 | eng-Latn |
Citation
@article{merrick2024embedding,
  author        = {Merrick, Luke},
  title         = {Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
  journal       = {arXiv preprint arXiv:2407.18887},
  eprint        = {2407.18887},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2407.18887},
  year          = {2024},
}
Snowflake/snowflake-arctic-embed-m-long¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 137.0M | 768 | 2.0K | 522.0 MB | 2024-04-12 | eng-Latn |
Citation
@article{merrick2024embedding,
  author        = {Merrick, Luke},
  title         = {Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
  journal       = {arXiv preprint arXiv:2407.18887},
  eprint        = {2407.18887},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2407.18887},
  year          = {2024},
}
Snowflake/snowflake-arctic-embed-m-v1.5¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 512 | 415.0 MB | 2024-07-08 | eng-Latn |
Citation
@article{merrick2024embedding,
  author        = {Merrick, Luke},
  title         = {Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
  journal       = {arXiv preprint arXiv:2407.18887},
  eprint        = {2407.18887},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2407.18887},
  year          = {2024},
}
Snowflake/snowflake-arctic-embed-m-v2.0¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 305.0M | 768 | 8.2K | 1.1 GB | 2024-12-04 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (74) |
Citation
@article{yu2024arctic,
  author        = {Yu, Puxuan and Merrick, Luke and Nuti, Gaurav and Campos, Daniel},
  title         = {Arctic-Embed 2.0: Multilingual Retrieval Without Compromise},
  journal       = {arXiv preprint arXiv:2412.04506},
  eprint        = {2412.04506},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2412.04506},
  year          = {2024},
}
Snowflake/snowflake-arctic-embed-s¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 384 | 512 | 127.0 MB | 2024-04-12 | eng-Latn |
Citation
@article{merrick2024embedding,
  author        = {Merrick, Luke},
  title         = {Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
  journal       = {arXiv preprint arXiv:2407.18887},
  eprint        = {2407.18887},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2407.18887},
  year          = {2024},
}
Snowflake/snowflake-arctic-embed-xs¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 384 | 512 | 86.0 MB | 2024-07-08 | eng-Latn |
Citation
@article{merrick2024embedding,
  author        = {Merrick, Luke},
  title         = {Embedding And Clustering Your Data Can Improve Contrastive Pretraining},
  journal       = {arXiv preprint arXiv:2407.18887},
  eprint        = {2407.18887},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2407.18887},
  year          = {2024},
}
Tarka-AIR/Tarka-Embedding-150M-V1¶
License: gemma • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 155.7M | 768 | 2.0K | 576.0 MB | 2025-11-04 | arb-Arab, deu-Latn, eng-Latn, fra-Latn, jpn-Jpan, ... (8) |
Citation
@misc{tarka_ai_research_2025,
  author    = {{Tarka AI Research}},
  title     = {Tarka-Embedding-150M-V1 (Revision c5f4f43)},
  year      = {2025},
  url       = {https://huggingface.co/Tarka-AIR/Tarka-Embedding-150M-V1},
  doi       = {10.57967/hf/6875},
  publisher = {Hugging Face},
}
Tarka-AIR/Tarka-Embedding-350M-V1¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 354.5M | 1024 | 128.0K | 676.0 MB | 2025-11-11 | arb-Arab, deu-Latn, eng-Latn, fra-Latn, jpn-Jpan, ... (8) |
Citation
@misc{tarka_ai_research_2025,
  author    = {{Tarka AI Research}},
  title     = {Tarka-Embedding-350M-V1 (Revision f4b5de8)},
  year      = {2025},
  url       = {https://huggingface.co/Tarka-AIR/Tarka-Embedding-350M-V1},
  doi       = {10.57967/hf/6979},
  publisher = {Hugging Face},
}
TencentBAC/Conan-embedding-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 3584 | 32.8K | not specified | 2025-04-10 | eng-Latn, zho-Hans |
VPLabs/SearchMap_Preview¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 435.0M | 4096 | 8.2K | 1.6 GB | 2025-03-05 | eng-Latn |
Citation
@misc{vectorpath2025searchmap,
  title        = {SearchMap: Conversational E-commerce Search Embedding Model},
  author       = {{VectorPath Research Team}},
  year         = {2025},
  publisher    = {Hugging Face},
  howpublished = {Hugging Face Model Hub},
}
WhereIsAI/UAE-Large-V1¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.1M | 1024 | 512 | 1.2 GB | 2023-12-04 | eng-Latn |
Citation
@article{li2023angle,
title={AnglE-optimized Text Embeddings},
author={Li, Xianming and Li, Jing},
journal={arXiv preprint arXiv:2309.12871},
year={2023}
}
ai-forever/FRIDA¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 823.0M | 1536 | 512 | 3.1 GB | 2024-12-29 | rus-Cyrl |
ai-forever/ru-en-RoSBERTa¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 404.0M | 1024 | 512 | 1.5 GB | 2024-07-29 | rus-Cyrl |
Citation
@misc{snegirev2024russianfocusedembeddersexplorationrumteb,
  author        = {Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov},
  title         = {The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
  year          = {2024},
  eprint        = {2408.12503},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2408.12503},
}
ai-sage/Giga-Embeddings-instruct¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 3.2B | 2048 | 4.1K | 12.6 GB | 2025-09-23 | eng-Latn, rus-Cyrl |
annamodels/LGAI-Embedding-Preview¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 26.5 GB | 2025-06-11 | eng-Latn |
Citation
@misc{choi2025lgaiembeddingpreviewtechnicalreport,
  author        = {Jooyoung Choi and Hyun Kim and Hansol Jang and Changwook Jun and Kyunghoon Bae and Hyewon Choi and Stanley Jungkyu Choi and Honglak Lee and Chulmin Yun},
  title         = {LGAI-EMBEDDING-Preview Technical Report},
  year          = {2025},
  eprint        = {2506.07438},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2506.07438},
}
bedrock/cohere-embed-english-v3¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 512 | not specified | 2023-11-02 | eng-Latn |
bedrock/cohere-embed-multilingual-v3¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 512 | not specified | 2023-11-02 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (111) |
bflhc/MoD-Embedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 32.8K | 7.5 GB | 2025-12-14 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{mod-embedding-2025,
  title  = {{MoD-Embedding}: A Fine-tuned Multilingual Text Embedding Model},
  author = {{MoD Team}},
  year   = {2025},
  url    = {https://huggingface.co/bflhc/MoD-Embedding},
}
castorini/repllama-v1-7b-lora-passage¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.0B | 4096 | 4.1K | 27.0 MB | 2023-10-11 | eng-Latn |
Citation
@article{rankllama,
  title   = {Fine-Tuning {LLaMA} for Multi-Stage Text Retrieval},
  author  = {Xueguang Ma and Liang Wang and Nan Yang and Furu Wei and Jimmy Lin},
  journal = {arXiv preprint arXiv:2310.08319},
  year    = {2023},
}
cl-nagoya/ruri-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 111.2M | 768 | 512 | 212.0 MB | 2024-08-28 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-base-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 111.2M | 768 | 512 | 424.0 MB | 2024-12-05 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-large¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 337.4M | 1024 | 512 | 644.0 MB | 2024-08-28 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-large-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 337.4M | 1024 | 512 | 1.3 GB | 2024-12-06 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-small¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 68.1M | 768 | 512 | 130.0 MB | 2024-08-28 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-small-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 68.1M | 768 | 512 | 260.0 MB | 2024-12-05 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-v3-130m¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 132.1M | 512 | 8.2K | 504.0 MB | 2025-04-09 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-v3-30m¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 36.7M | 256 | 8.2K | 140.0 MB | 2025-04-07 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-v3-310m¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 314.6M | 768 | 8.2K | 1.2 GB | 2025-04-09 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
cl-nagoya/ruri-v3-70m¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 70.0M | 256 | 8.2K | 140.0 MB | 2025-04-09 | jpn-Jpan |
Citation
@misc{Ruri,
  author        = {Hayato Tsukagoshi and Ryohei Sasano},
  title         = {{Ruri: Japanese General Text Embeddings}},
  year          = {2024},
  eprint        = {2409.07737},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.07737},
}
clips/e5-base-trm-nl¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.4M | 768 | 514 | 237.0 MB | 2025-09-23 | nld-Latn |
Citation
@misc{banar2025mtebnle5nlembeddingbenchmark,
  author        = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
  title         = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
  year          = {2025},
  eprint        = {2509.12340},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2509.12340},
}
clips/e5-large-trm-nl¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 355.0M | 1024 | 514 | 1.3 GB | 2025-09-23 | nld-Latn |
Citation
@misc{banar2025mtebnle5nlembeddingbenchmark,
  author        = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
  title         = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
  year          = {2025},
  eprint        = {2509.12340},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2509.12340},
}
clips/e5-small-trm-nl¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 40.8M | 384 | 512 | 78.0 MB | 2025-09-23 | nld-Latn |
Citation
@misc{banar2025mtebnle5nlembeddingbenchmark,
  author        = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
  title         = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
  year          = {2025},
  eprint        = {2509.12340},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2509.12340},
}
codefuse-ai/C2LLM-0.5B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 497.3M | 896 | 32.8K | 948.0 MB | 2025-12-22 | eng-Latn, go-Code, java-Code, javascript-Code, php-Code, ... (8) |
codefuse-ai/C2LLM-7B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.7B | 3584 | 32.8K | 14.3 GB | 2025-12-22 | eng-Latn, go-Code, java-Code, javascript-Code, php-Code, ... (8) |
codefuse-ai/F2LLM-0.6B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 596.0M | 1024 | 8.2K | 1.1 GB | 2025-09-18 | eng-Latn |
Citation
@article{2025F2LLM,
  author     = {Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
  title      = {F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
  journal    = {CoRR},
  volume     = {abs/2510.02294},
  year       = {2025},
  doi        = {10.48550/ARXIV.2510.02294},
  eprint     = {2510.02294},
  eprinttype = {arXiv},
  url        = {https://doi.org/10.48550/arXiv.2510.02294},
}
codefuse-ai/F2LLM-1.7B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.7B | 2048 | 8.2K | 3.2 GB | 2025-09-18 | eng-Latn |
Citation
@article{2025F2LLM,
  author     = {Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
  title      = {F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
  journal    = {CoRR},
  volume     = {abs/2510.02294},
  year       = {2025},
  doi        = {10.48550/ARXIV.2510.02294},
  eprint     = {2510.02294},
  eprinttype = {arXiv},
  url        = {https://doi.org/10.48550/arXiv.2510.02294},
}
codefuse-ai/F2LLM-4B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 8.2K | 7.5 GB | 2025-09-18 | eng-Latn |
Citation
@article{2025F2LLM,
  author     = {Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
  title      = {F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
  journal    = {CoRR},
  volume     = {abs/2510.02294},
  year       = {2025},
  doi        = {10.48550/ARXIV.2510.02294},
  eprint     = {2510.02294},
  eprinttype = {arXiv},
  url        = {https://doi.org/10.48550/arXiv.2510.02294},
}
codefuse-ai/F2LLM-v2-0.6B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 596.0M | 1024 | 41.0K | 2.2 GB | 2026-03-09 | afr-Latn, amh-Ethi, ara-Arab, azb-Arab, aze-Latn, ... (92) |
codefuse-ai/F2LLM-v2-1.7B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.7B | 2048 | 41.0K | 6.4 GB | 2026-03-09 | afr-Latn, amh-Ethi, ara-Arab, azb-Arab, aze-Latn, ... (92) |
codefuse-ai/F2LLM-v2-14B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 14.0B | 5120 | 41.0K | 52.1 GB | 2026-03-09 | afr-Latn, amh-Ethi, ara-Arab, azb-Arab, aze-Latn, ... (92) |
codefuse-ai/F2LLM-v2-160M¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 159.2M | 640 | 41.0K | 607.0 MB | 2026-03-09 | afr-Latn, amh-Ethi, ara-Arab, azb-Arab, aze-Latn, ... (92) |
codefuse-ai/F2LLM-v2-330M¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 334.3M | 896 | 41.0K | 1.2 GB | 2026-03-09 | afr-Latn, amh-Ethi, ara-Arab, azb-Arab, aze-Latn, ... (92) |
codefuse-ai/F2LLM-v2-4B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 41.0K | 15.0 GB | 2026-03-09 | afr-Latn, amh-Ethi, ara-Arab, azb-Arab, aze-Latn, ... (92) |
codefuse-ai/F2LLM-v2-80M¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 80.1M | 320 | 41.0K | 305.0 MB | 2026-03-09 | afr-Latn, amh-Ethi, ara-Arab, azb-Arab, aze-Latn, ... (92) |
codefuse-ai/F2LLM-v2-8B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.6B | 4096 | 41.0K | 28.2 GB | 2026-03-09 | afr-Latn, amh-Ethi, ara-Arab, azb-Arab, aze-Latn, ... (92) |
deepvk/USER-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 427.0M | 768 | 512 | 473.0 MB | 2024-06-10 | rus-Cyrl |
Citation
@misc{deepvk2024user,
  title     = {USER: Universal Sentence Encoder for Russian},
  author    = {Malashenko, Boris and Zemerov, Anton and Spirin, Egor},
  url       = {https://huggingface.co/datasets/deepvk/USER-base},
  publisher = {Hugging Face},
  year      = {2024},
}
deepvk/USER2-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 768 | 8.2K | 568.0 MB | 2025-04-19 | rus-Cyrl |
Citation
@misc{deepvk2025user,
  title     = {USER2},
  author    = {Malashenko, Boris and Spirin, Egor and Sokolov, Andrey},
  url       = {https://huggingface.co/deepvk/USER2-base},
  publisher = {Hugging Face},
  year      = {2025},
}
deepvk/USER2-small¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 34.4M | 384 | 8.2K | 131.0 MB | 2025-04-19 | rus-Cyrl |
Citation
@misc{deepvk2025user,
  title     = {USER2},
  author    = {Malashenko, Boris and Spirin, Egor and Sokolov, Andrey},
  url       = {https://huggingface.co/deepvk/USER2-small},
  publisher = {Hugging Face},
  year      = {2025},
}
emillykkejensen/EmbeddingGemma-Scandi-300m¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 307.6M | 768 | 2.0K | 578.0 MB | 2025-10-17 | dan-Latn, nno-Latn, nob-Latn, nor-Latn, swe-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
emillykkejensen/Qwen3-Embedding-Scandi-0.6B¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 595.8M | 1024 | 32.8K | 2.2 GB | 2025-10-17 | dan-Latn, nno-Latn, nob-Latn, nor-Latn, swe-Latn |
emillykkejensen/mmBERTscandi-base-embedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 306.9M | 768 | 8.2K | 1.1 GB | 2025-10-17 | dan-Latn, nno-Latn, nob-Latn, nor-Latn, swe-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
fyaronskiy/english_code_retriever¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 768 | 8.2K | 568.0 MB | 2025-07-10 | eng-Latn |
google/embeddinggemma-300m¶
License: gemma • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 307.6M | 768 | 2.0K | 1.1 GB | 2025-09-04 | arb-Arab, ben-Beng, deu-Latn, eng-Latn, fin-Latn, ... (19) |
Citation
@misc{vera2025embeddinggemmapowerfullightweighttext,
  author        = {Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
  title         = {EmbeddingGemma: Powerful and Lightweight Text Representations},
  year          = {2025},
  eprint        = {2509.20354},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2509.20354},
}
google/gemini-embedding-001¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 3072 | 2.0K | not specified | 2025-03-07 | arb-Arab, ben-Beng, deu-Latn, eng-Latn, fin-Latn, ... (19) |
google/text-embedding-004¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 2.0K | not specified | 2024-05-14 | eng-Latn |
google/text-embedding-005¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 2.0K | not specified | 2024-11-18 | eng-Latn |
google/text-multilingual-embedding-002¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 2.0K | not specified | 2024-05-14 | arb-Arab, ben-Beng, deu-Latn, eng-Latn, fin-Latn, ... (19) |
infgrad/Jasper-Token-Compression-600M¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 607.3M | 2048 | 32.8K | 2.2 GB | 2025-11-14 | eng-Latn, zho-Hans |
Citation
@misc{zhang2025jaspertokencompression600mtechnicalreport,
  author        = {Dun Zhang and Ziyang Zeng and Yudong Zhou and Shuyang Lu},
  title         = {Jasper-Token-Compression-600M Technical Report},
  year          = {2025},
  eprint        = {2511.14405},
  archivePrefix = {arXiv},
  primaryClass  = {cs.IR},
  url           = {https://arxiv.org/abs/2511.14405},
}
infly/inf-retriever-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 3584 | 32.8K | 13.2 GB | 2024-12-24 | eng-Latn, zho-Hans |
Citation
@misc{infly-ai_2025,
  author    = {Junhan Yang and Jiahe Wan and Yichen Yao and Wei Chu and Yinghui Xu and Yuan Qi},
  title     = {inf-retriever-v1 (Revision 5f469d7)},
  year      = {2025},
  url       = {https://huggingface.co/infly/inf-retriever-v1},
  doi       = {10.57967/hf/4262},
  publisher = {Hugging Face},
}
infly/inf-retriever-v1-1.5b¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.5B | 1536 | 32.8K | 2.9 GB | 2025-02-08 | eng-Latn, zho-Hans |
Citation
@misc{infly-ai_2025,
  author    = {Junhan Yang and Jiahe Wan and Yichen Yao and Wei Chu and Yinghui Xu and Yuan Qi},
  title     = {inf-retriever-v1 (Revision 5f469d7)},
  year      = {2025},
  url       = {https://huggingface.co/infly/inf-retriever-v1},
  doi       = {10.57967/hf/4262},
  publisher = {Hugging Face},
}
intfloat/e5-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.0M | 768 | 512 | 418.0 MB | 2022-12-26 | eng-Latn |
Citation
@article{wang2022text,
  author  = {Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
  title   = {Text Embeddings by Weakly-Supervised Contrastive Pre-training},
  journal = {arXiv preprint arXiv:2212.03533},
  year    = {2022},
}
intfloat/e5-base-v2¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.0M | 768 | 512 | 418.0 MB | 2024-02-08 | eng-Latn |
Citation
@article{wang2022text,
title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2212.03533},
year={2022}
}
intfloat/e5-large¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.0M | 1024 | 512 | 1.2 GB | 2022-12-26 | eng-Latn |
Citation
@article{wang2022text,
title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2212.03533},
year={2022}
}
intfloat/e5-large-v2¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.0M | 1024 | 514 | 1.2 GB | 2024-02-08 | eng-Latn |
Citation
@article{wang2022text,
title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2212.03533},
year={2022}
}
intfloat/e5-mistral-7b-instruct¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 13.2 GB | 2024-02-08 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn |
Citation
@article{wang2023improving,
title={Improving Text Embeddings with Large Language Models},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2401.00368},
year={2023}
}
@article{wang2022text,
title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2212.03533},
year={2022}
}
intfloat/e5-small¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.0M | 384 | 512 | 127.0 MB | 2024-02-08 | eng-Latn |
Citation
@article{wang2022text,
title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2212.03533},
year={2022}
}
intfloat/e5-small-v2¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.0M | 384 | 512 | 127.0 MB | 2024-02-08 | eng-Latn |
Citation
@article{wang2022text,
title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2212.03533},
year={2022}
}
intfloat/multilingual-e5-base¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 514 | 1.0 GB | 2024-02-08 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
Citation
@article{wang2024multilingual,
title={Multilingual E5 Text Embeddings: A Technical Report},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2402.05672},
year={2024}
}
intfloat/multilingual-e5-large¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 560.0M | 1024 | 514 | 2.1 GB | 2024-02-08 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
Citation
@article{wang2024multilingual,
title={Multilingual E5 Text Embeddings: A Technical Report},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2402.05672},
year={2024}
}
intfloat/multilingual-e5-large-instruct¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 560.0M | 1024 | 514 | 1.0 GB | 2024-02-08 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
Citation
@article{wang2024multilingual,
title={Multilingual E5 Text Embeddings: A Technical Report},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2402.05672},
year={2024}
}
intfloat/multilingual-e5-small¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 118.0M | 384 | 512 | 449.0 MB | 2024-02-08 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
Citation
@article{wang2024multilingual,
title={Multilingual E5 Text Embeddings: A Technical Report},
author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
journal={arXiv preprint arXiv:2402.05672},
year={2024}
}
jinaai/jina-embeddings-v3¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 572.3M | 1024 | 8.2K | 1.1 GB | 2024-09-18 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
Citation
@misc{sturua2024jinaembeddingsv3multilingualembeddingstask,
title={jina-embeddings-v3: Multilingual Embeddings With Task LoRA},
author={Saba Sturua and Isabelle Mohr and Mohammad Kalim Akram and Michael Günther and Bo Wang and Markus Krimmel and Feng Wang and Georgios Mastrapas and Andreas Koukounas and Nan Wang and Han Xiao},
year={2024},
eprint={2409.10173},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2409.10173},
}
jinaai/jina-embeddings-v5-text-nano¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 211.8M | [32, 64, 128, 256, 512, 768] | 8.2K | 404.0 MB | 2026-02-17 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{akram2026jinaembeddingsv5texttasktargetedembeddingdistillation,
title={jina-embeddings-v5-text: Task-Targeted Embedding Distillation},
author={Mohammad Kalim Akram and Saba Sturua and Nastia Havriushenko and Quentin Herreros and Michael Günther and Maximilian Werk and Han Xiao},
year={2026},
eprint={2602.15547},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2602.15547},
}
jinaai/jina-embeddings-v5-text-small¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 596.0M | [32, 64, 128, 256, 512, 768, 1024] | 32.8K | 1.1 GB | 2026-02-17 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{akram2026jinaembeddingsv5texttasktargetedembeddingdistillation,
title={jina-embeddings-v5-text: Task-Targeted Embedding Distillation},
author={Mohammad Kalim Akram and Saba Sturua and Nastia Havriushenko and Quentin Herreros and Michael Günther and Maximilian Werk and Han Xiao},
year={2026},
eprint={2602.15547},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2602.15547},
}
jxm/cde-small-v1¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 281.0M | 768 | 512 | 1.0 GB | 2024-09-24 | eng-Latn |
Citation
@misc{morris2024contextualdocumentembeddings,
title={Contextual Document Embeddings},
author={John X. Morris and Alexander M. Rush},
year={2024},
eprint={2410.02525},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.02525},
}
jxm/cde-small-v2¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 306.0M | 768 | 512 | 1.1 GB | 2025-01-13 | eng-Latn |
Citation
@misc{morris2024contextualdocumentembeddings,
title={Contextual Document Embeddings},
author={John X. Morris and Alexander M. Rush},
year={2024},
eprint={2410.02525},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.02525},
}
llamaindex/vdr-2b-multi-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 2.2B | 1536 | 32.8K | 4.1 GB | 2024-01-08 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, spa-Latn |
manveertamber/cadet-embed-base-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.0M | 768 | 512 | 418.0 MB | 2025-05-11 | eng-Latn |
Citation
@article{tamber2025conventionalcontrastivelearningfalls,
title={Conventional Contrastive Learning Often Falls Short: Improving Dense Retrieval with Cross-Encoder Listwise Distillation and Synthetic Data},
author={Manveer Singh Tamber and Suleman Kazi and Vivek Sourabh and Jimmy Lin},
journal={arXiv preprint arXiv:2505.19274},
year={2025}
}
microsoft/harrier-oss-v1-0.6b¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 596.0M | 1024 | 32.8K | 1.1 GB | 2026-03-27 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
microsoft/harrier-oss-v1-270m¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 268.1M | 640 | 32.8K | 512.0 MB | 2026-03-27 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
microsoft/harrier-oss-v1-27b¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 27.0B | 5376 | 131.1K | 50.3 GB | 2026-03-27 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
mixedbread-ai/mxbai-embed-2d-large-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.1M | 768 | 512 | not specified | 2024-03-04 | eng-Latn |
mixedbread-ai/mxbai-embed-large-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.1M | 1024 | 512 | 639.0 MB | 2024-03-07 | eng-Latn |
Citation
@online{emb2024mxbai,
title={Open Source Strikes Bread - New Fluffy Embeddings Model},
author={Sean Lee and Aamir Shakir and Darius Koenig and Julius Lipp},
year={2024},
url={https://www.mixedbread.ai/blog/mxbai-embed-large-v1},
}
@article{li2023angle,
title={AnglE-optimized Text Embeddings},
author={Li, Xianming and Li, Jing},
journal={arXiv preprint arXiv:2309.12871},
year={2023}
}
mixedbread-ai/mxbai-embed-xsmall-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 24.1M | 384 | 512 | not specified | 2024-08-13 | eng-Latn |
Citation
@online{xsmall2024mxbai,
title={Every Byte Matters: Introducing mxbai-embed-xsmall-v1},
author={Sean Lee and Julius Lipp and Rui Huang and Darius Koenig},
year={2024},
url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
}
nomic-ai/modernbert-embed-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 768 | 8.2K | 568.0 MB | 2024-12-29 | eng-Latn |
Citation
@misc{nussbaum2024nomic,
title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
year={2024},
eprint={2402.01613},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
nomic-ai/nomic-embed-code¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 3584 | 32.8K | 26.3 GB | 2025-03-24 | eng-Latn |
Citation
@misc{suresh2025cornstackhighqualitycontrastivedata,
title={CoRNStack: High-Quality Contrastive Data for Better Code Retrieval and Reranking},
author={Tarun Suresh and Revanth Gangi Reddy and Yifei Xu and Zach Nussbaum and Andriy Mulyar and Brandon Duderstadt and Heng Ji},
year={2025},
eprint={2412.01007},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.01007},
}
nomic-ai/nomic-embed-text-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 136.7M | 768 | 8.2K | 522.0 MB | 2024-01-31 | eng-Latn |
Citation
@misc{nussbaum2024nomic,
title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
year={2024},
eprint={2402.01613},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
nomic-ai/nomic-embed-text-v1-ablated¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 136.7M | 768 | 8.2K | not specified | 2024-01-15 | eng-Latn |
nomic-ai/nomic-embed-text-v1-unsupervised¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 136.7M | 768 | 8.2K | not specified | 2024-01-15 | eng-Latn |
nomic-ai/nomic-embed-text-v1.5¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 136.7M | 768 | 8.2K | 522.0 MB | 2024-02-10 | eng-Latn |
Citation
@misc{nussbaum2024nomic,
title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
year={2024},
eprint={2402.01613},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
nomic-ai/nomic-embed-text-v2-moe¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 475.3M | 768 | 512 | 1.8 GB | 2025-02-07 | amh-Ethi, arb-Arab, bel-Cyrl, ben-Beng, bul-Cyrl, ... (98) |
Citation
@misc{nussbaum2025trainingsparsemixtureexperts,
title={Training Sparse Mixture Of Experts Text Embedding Models},
author={Zach Nussbaum and Brandon Duderstadt},
year={2025},
eprint={2502.07972},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.07972},
}
nvidia/NV-Embed-v1¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.8B | 4096 | 32.8K | 14.6 GB | 2024-09-13 | eng-Latn |
Citation
@misc{lee2025nvembedimprovedtechniquestraining,
title={NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding Models},
author={Chankyu Lee and Rajarshi Roy and Mengyao Xu and Jonathan Raiman and Mohammad Shoeybi and Bryan Catanzaro and Wei Ping},
year={2025},
eprint={2405.17428},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.17428},
}
nvidia/NV-Embed-v2¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.8B | 4096 | 32.8K | 14.6 GB | 2024-09-09 | eng-Latn |
Citation
@misc{lee2025nvembedimprovedtechniquestraining,
title={NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding Models},
author={Chankyu Lee and Rajarshi Roy and Mengyao Xu and Jonathan Raiman and Mohammad Shoeybi and Bryan Catanzaro and Wei Ping},
year={2025},
eprint={2405.17428},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.17428},
}
nvidia/llama-embed-nemotron-8b¶
License: https://huggingface.co/nvidia/llama-embed-nemotron-8b/blob/main/LICENSE • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.5B | 4096 | 32.8K | 28.0 GB | 2025-10-23 | afr-Latn, amh-Ethi, ara-Arab, arq-Arab, ary-Arab, ... (66) |
Citation
@misc{babakhin2025llamaembednemotron8buniversaltextembedding,
title={Llama-Embed-Nemotron-8B: A Universal Text Embedding Model for Multilingual and Cross-Lingual Tasks},
author={Yauhen Babakhin and Radek Osmulski and Ronay Ak and Gabriel Moreira and Mengyao Xu and Benedikt Schifferer and Bo Liu and Even Oldridge},
year={2025},
eprint={2511.07025},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2511.07025},
}
opensearch-project/opensearch-neural-sparse-encoding-doc-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 133.0M | 30522 | 512 | 507.0 MB | 2024-03-07 | eng-Latn |
opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 66.4M | 30522 | 512 | 267.0 MB | 2024-07-17 | eng-Latn |
opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 30522 | 512 | 86.0 MB | 2024-07-18 | eng-Latn |
opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 66.4M | 30522 | 512 | 267.0 MB | 2025-03-28 | eng-Latn |
opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 136.8M | 30522 | 8.2K | 549.0 MB | 2025-06-18 | eng-Latn |
qihoo360/Zhinao-ChineseModernBert-Embedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 227.0M | 768 | 512 | 866.0 MB | 2026-03-18 | zho-Hans |
samaya-ai/RepLLaMA-reproduced¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.0B | 4096 | 4.1K | 27.0 MB | 2024-09-15 | eng-Latn |
Citation
@article{rankllama,
title={Fine-Tuning LLaMA for Multi-Stage Text Retrieval},
author={Xueguang Ma and Liang Wang and Nan Yang and Furu Wei and Jimmy Lin},
year={2023},
journal={arXiv preprint arXiv:2310.08319},
}
samaya-ai/promptriever-llama2-7b-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.0B | 4096 | 4.1K | 26.1 GB | 2024-09-15 | eng-Latn |
Citation
@article{weller2024promptriever,
title={Promptriever: Instruction-Trained Retrievers Can Be Prompted Like Language Models},
author={Orion Weller and Benjamin Van Durme and Dawn Lawrie and Ashwin Paranjape and Yuhao Zhang and Jack Hessel},
year={2024},
eprint={2409.11136},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2409.11136},
}
samaya-ai/promptriever-llama3.1-8b-instruct-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 8.0B | 4096 | 8.2K | 29.8 GB | 2024-09-15 | eng-Latn |
Citation
@article{weller2024promptriever,
title={Promptriever: Instruction-Trained Retrievers Can Be Prompted Like Language Models},
author={Orion Weller and Benjamin Van Durme and Dawn Lawrie and Ashwin Paranjape and Yuhao Zhang and Jack Hessel},
year={2024},
eprint={2409.11136},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2409.11136},
}
samaya-ai/promptriever-llama3.1-8b-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 8.0B | 4096 | 8.2K | 29.8 GB | 2024-09-15 | eng-Latn |
Citation
@article{weller2024promptriever,
title={Promptriever: Instruction-Trained Retrievers Can Be Prompted Like Language Models},
author={Orion Weller and Benjamin Van Durme and Dawn Lawrie and Ashwin Paranjape and Yuhao Zhang and Jack Hessel},
year={2024},
eprint={2409.11136},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2409.11136},
}
samaya-ai/promptriever-mistral-v0.1-7b-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.0B | 4096 | 4.1K | 26.1 GB | 2024-09-15 | eng-Latn |
Citation
@article{weller2024promptriever,
title={Promptriever: Instruction-Trained Retrievers Can Be Prompted Like Language Models},
author={Orion Weller and Benjamin Van Durme and Dawn Lawrie and Ashwin Paranjape and Yuhao Zhang and Jack Hessel},
year={2024},
eprint={2409.11136},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2409.11136},
}
sbintuitions/sarashina-embedding-v2-1b¶
License: https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b/blob/main/LICENSE • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.2B | 1792 | 8.2K | 4.6 GB | 2025-07-30 | jpn-Jpan |
sergeyzh/BERTA¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 128.0M | 768 | 512 | 489.0 MB | 2025-03-10 | rus-Cyrl |
sergeyzh/rubert-mini-frida¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 32.3M | 312 | 2.0K | 123.0 MB | 2025-03-02 | rus-Cyrl |
telepix/PIXIE-Rune-v1.0¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 567.8M | 1024 | 6.1K | 2.1 GB | 2026-01-15 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (74) |
Citation
@misc{TelePIX-PIXIE-Rune-v1.0,
title = {PIXIE-Rune-v1.0},
author = {{TelePIX AI Research Team} and Kim, Bongmin},
year = {2026},
howpublished = {Hugging Face model card},
url = {https://huggingface.co/telepix/PIXIE-Rune-v1.0}
}
@article{yu2024arctic,
title={Arctic-Embed 2.0: Multilingual Retrieval Without Compromise},
author={Yu, Puxuan and Merrick, Luke and Nuti, Gaurav and Campos, Daniel},
journal={arXiv preprint arXiv:2412.04506},
year={2024},
eprint={2412.04506},
archivePrefix={arXiv},
url={https://arxiv.org/abs/2412.04506}
}
tencent/KaLM-Embedding-Gemma3-12B-2511¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 11.8B | 3840 | 32.8K | 43.8 GB | 2025-11-06 | not specified |
Citation
@misc{zhao2025kalmembeddingv2,
title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
year={2025},
eprint={2506.20923},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2506.20923},
}
@misc{hu2025kalmembedding,
title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
year={2025},
eprint={2501.01028},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.01028},
}
tencent/Youtu-Embedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 2.7B | 2048 | 8.2K | not specified | 2025-09-28 | zho-Hans |
Citation
@misc{zhang2025codiemb,
title={CoDiEmb: A Collaborative yet Distinct Framework for Unified Representation Learning in Information Retrieval and Semantic Textual Similarity},
author={Zhang, Bowen and Song, Zixin and Chen, Chunquan and Zhang, Qian-Wen and Yin, Di and Sun, Xing},
year={2025},
eprint={2508.11442},
archivePrefix={arXiv},
url={https://arxiv.org/abs/2508.11442},
}
voyageai/voyage-2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 4.0K | not specified | 2023-10-29 | not specified |
voyageai/voyage-3¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2024-09-18 | not specified |
voyageai/voyage-3-large¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2025-01-07 | not specified |
voyageai/voyage-3-lite¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 512 | 32.0K | not specified | 2024-09-18 | not specified |
voyageai/voyage-3-m-exp¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 6.9B | 2048 | 32.0K | not specified | 2025-01-08 | eng-Latn |
voyageai/voyage-3.5¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2025-01-21 | not specified |
voyageai/voyage-3.5 (output_dtype=binary)¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2025-01-21 | not specified |
voyageai/voyage-3.5 (output_dtype=int8)¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2025-01-21 | not specified |
voyageai/voyage-4¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2026-01-15 | not specified |
voyageai/voyage-4-large¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2026-01-15 | not specified |
voyageai/voyage-4-large (embed_dim=2048)¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 2048 | 32.0K | not specified | 2026-01-15 | not specified |
voyageai/voyage-4-lite¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2026-01-15 | not specified |
voyageai/voyage-4-nano¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 346.5M | 2048 | 32.0K | 661.0 MB | 2026-01-15 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
voyageai/voyage-code-2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1536 | 16.0K | not specified | 2024-01-23 | not specified |
voyageai/voyage-code-3¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2024-12-04 | not specified |
voyageai/voyage-finance-2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2024-05-30 | not specified |
voyageai/voyage-large-2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1536 | 16.0K | not specified | 2023-10-29 | not specified |
voyageai/voyage-large-2-instruct¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 16.0K | not specified | 2024-05-05 | not specified |
voyageai/voyage-law-2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 16.0K | not specified | 2024-04-15 | not specified |
voyageai/voyage-multilingual-2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 32.0K | not specified | 2024-06-10 | not specified |
yibinlei/LENS-d4000¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4000 | 32.8K | 26.5 GB | 2025-01-17 | not specified |
Citation
@article{lei2025lens,
title={Enhancing Lexicon-Based Text Embeddings with Large Language Models},
author={Lei, Yibin and Shen, Tao and Cao, Yu and Yates, Andrew},
journal={arXiv preprint arXiv:2501.09749},
year={2025}
}
yibinlei/LENS-d8000¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 8000 | 32.8K | 26.5 GB | 2025-01-17 | not specified |
Citation
@article{lei2025lens,
title={Enhancing Lexicon-Based Text Embeddings with Large Language Models},
author={Lei, Yibin and Shen, Tao and Cao, Yu and Yates, Andrew},
journal={arXiv preprint arXiv:2501.09749},
year={2025}
}
zeta-alpha-ai/Zeta-Alpha-E5-Mistral¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 13.2 GB | 2024-08-30 | eng-Latn |
Non-instruction Model¶
AITeamVN/Vietnamese_Embedding¶
License: cc-by-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 567.8M | 1024 | 8.2K | 2.1 GB | 2024-03-17 | vie-Latn |
Citation
@misc{Vietnamese_Embedding,
title={Vietnamese\_Embedding: Embedding model in Vietnamese language.},
author={Nguyen Nho Trung and Nguyen Nhat Quang and Nguyen Van Huy},
year={2025},
publisher={Huggingface},
}
Alibaba-NLP/gte-base-en-v1.5¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 136.8M | 768 | 8.2K | not specified | 2024-06-20 | eng-Latn |
Citation
@misc{zhang2024mgte,
title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
author={Xin Zhang and Yanzhao Zhang and Dingkun Long and Wen Xie and Ziqi Dai and Jialong Tang and Huan Lin and Baosong Yang and Pengjun Xie and Fei Huang and Meishan Zhang and Wenjie Li and Min Zhang},
year={2024},
eprint={2407.19669},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2407.19669},
}
@misc{li2023towards,
title={Towards General Text Embeddings with Multi-stage Contrastive Learning},
author={Zehan Li and Xin Zhang and Yanzhao Zhang and Dingkun Long and Pengjun Xie and Meishan Zhang},
year={2023},
eprint={2308.03281},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2308.03281},
}
Alibaba-NLP/gte-modernbert-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 768 | 8.2K | 284.0 MB | 2025-01-21 | eng-Latn |
Citation
@inproceedings{zhang2024mgte,
title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
pages={1393--1412},
year={2024}
}
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
Alibaba-NLP/gte-multilingual-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 305.4M | 768 | 8.2K | 582.0 MB | 2024-07-20 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@inproceedings{zhang2024mgte,
title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
pages={1393--1412},
year={2024}
}
BAAI/bge-en-icl¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | 32.8K | 26.5 GB | 2024-07-25 | eng-Latn |
Citation
@misc{li2024makingtextembeddersfewshot,
title={Making Text Embedders Few-Shot Learners},
author={Chaofan Li and MingHao Qin and Shitao Xiao and Jianlyu Chen and Kun Luo and Yingxia Shao and Defu Lian and Zheng Liu},
year={2024},
eprint={2409.15700},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2409.15700},
}
BAAI/bge-m3¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 568.0M | 1024 | 8.2K | 2.1 GB | 2024-06-28 | afr-Latn, amh-Ethi, ast-Latn, azj-Latn, azj-Latn, ... (29) |
Citation
@misc{bge-m3,
title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
year={2024},
eprint={2402.03216},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
BAAI/bge-m3-unsupervised¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 568.0M | 1024 | 8.2K | 2.1 GB | 2024-01-30 | afr-Latn, amh-Ethi, ast-Latn, azj-Latn, azj-Latn, ... (29) |
Citation
@misc{bge-m3,
title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
year={2024},
eprint={2402.03216},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
BAAI/bge-multilingual-gemma2¶
License: https://ai.google.dev/gemma/terms • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 9.2B | 3584 | 8.2K | 34.4 GB | 2024-07-25 | eng-Latn, fra-Latn, jpn-Jpan, jpn-Latn, kor-Hang, ... (7) |
Citation
@misc{bge-m3,
title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
year={2024},
eprint={2402.03216},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{xiao2024cpackpackagedresourcesadvance,
archiveprefix = {arXiv},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
eprint = {2309.07597},
primaryclass = {cs.CL},
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
url = {https://arxiv.org/abs/2309.07597},
year = {2024},
}
BAAI/bge-reranker-v2-m3¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | 2.1 GB | 2024-06-24 | ara-Arab, ben-Beng, dan-Latn, deu-Latn, eng-Latn, ... (32) |
Citation
@misc{li2023making,
title={Making Large Language Models A Better Foundation For Dense Retrieval},
author={Chaofan Li and Zheng Liu and Shitao Xiao and Yingxia Shao},
year={2023},
eprint={2312.15503},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{bge-m3,
archiveprefix = {arXiv},
author = {Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
eprint = {2402.03216},
primaryclass = {cs.CL},
title = {BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
year = {2024},
}
ByteDance/ListConRanker¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 401.0M | 1024 | 512 | 1.2 GB | 2024-12-11 | zho-Hans |
Citation
@article{liu2025listconranker,
title={ListConRanker: A Contrastive Text Reranker with Listwise Encoding},
author={Liu, Junlong and Ma, Yue and Zhao, Ruihui and Zheng, Junhao and Ma, Qianli and Kang, Yangyang},
journal={arXiv preprint arXiv:2501.07111},
year={2025}
}
Classical/Yinka¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 326.0M | 1024 | 512 | 1.2 GB | 2024-01-09 | zho-Hans |
DMetaSoul/Dmeta-embedding-zh-small¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 74.2M | 768 | 1.0K | 283.0 MB | 2024-03-25 | zho-Hans |
DMetaSoul/sbert-chinese-general-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 102.3M | 128 | 512 | not specified | 2022-03-25 | zho-Hans |
DeepPavlov/distilrubert-small-cased-conversational¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 107.0M | 768 | 512 | 408.0 MB | 2022-06-28 | rus-Cyrl |
Citation
@misc{https://doi.org/10.48550/arxiv.2205.02340,
doi = {10.48550/ARXIV.2205.02340},
url = {https://arxiv.org/abs/2205.02340},
author = {Kolesnikova, Alina and Kuratov, Yuri and Konovalov, Vasily and Burtsev, Mikhail},
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Knowledge Distillation of Russian Language Models with Reduction of Vocabulary},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license}
}
DeepPavlov/rubert-base-cased¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.3B | 768 | 512 | 4.8 GB | 2020-03-04 | rus-Cyrl |
Citation
@misc{kuratov2019adaptationdeepbidirectionalmultilingual,
title={Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language},
author={Yuri Kuratov and Mikhail Arkhipov},
year={2019},
eprint={1905.07213},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1905.07213},
}
DeepPavlov/rubert-base-cased-sentence¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 107.0M | 768 | 512 | 408.0 MB | 2020-03-04 | rus-Cyrl |
FacebookAI/xlm-roberta-base¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 512 | 1.0 GB | 2019-11-05 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
Citation
@article{DBLP:journals/corr/abs-1911-02116,
author = {Alexis Conneau and
Kartikay Khandelwal and
Naman Goyal and
Vishrav Chaudhary and
Guillaume Wenzek and
Francisco Guzm{\'{a}}n and
Edouard Grave and
Myle Ott and
Luke Zettlemoyer and
Veselin Stoyanov},
title = {Unsupervised Cross-lingual Representation Learning at Scale},
journal = {CoRR},
volume = {abs/1911.02116},
year = {2019},
url = {http://arxiv.org/abs/1911.02116},
eprinttype = {arXiv},
eprint = {1911.02116},
timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
FacebookAI/xlm-roberta-large¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 559.9M | 1024 | 512 | 2.1 GB | 2019-11-05 | afr-Latn, amh-Latn, ara-Latn, asm-Latn, aze-Latn, ... (99) |
Citation
@article{DBLP:journals/corr/abs-1911-02116,
author = {Alexis Conneau and
Kartikay Khandelwal and
Naman Goyal and
Vishrav Chaudhary and
Guillaume Wenzek and
Francisco Guzm{\'{a}}n and
Edouard Grave and
Myle Ott and
Luke Zettlemoyer and
Veselin Stoyanov},
title = {Unsupervised Cross-lingual Representation Learning at Scale},
journal = {CoRR},
volume = {abs/1911.02116},
year = {2019},
url = {http://arxiv.org/abs/1911.02116},
eprinttype = {arXiv},
eprint = {1911.02116},
timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
Gameselo/STS-multilingual-mpnet-base-v2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 514 | 1.0 GB | 2024-06-07 | not specified |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
GreenNode/GreenNode-Embedding-E5-Large-VN-V1¶
License: cc-by-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 560.0M | 1024 | 512 | 2.1 GB | 2026-02-26 | vie-Latn |
GreenNode/GreenNode-Embedding-KaLM-Mini-Instruct-VN-V1¶
License: cc-by-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 494.0M | 896 | 32.8K | 1.8 GB | 2026-02-26 | vie-Latn |
GreenNode/GreenNode-Embedding-Large-VN-Mixed-V1¶
License: cc-by-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 567.8M | 1024 | 8.2K | 2.1 GB | 2024-04-11 | vie-Latn |
GreenNode/GreenNode-Embedding-Large-VN-V1¶
License: cc-by-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 567.8M | 1024 | 8.2K | 2.1 GB | 2024-04-11 | vie-Latn |
HIT-TMG/KaLM-embedding-multilingual-mini-v1¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 494.0M | 896 | 512 | 1.8 GB | 2024-08-27 | eng-Latn, zho-Hans |
Citation
@misc{hu2025kalmembedding,
title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
year={2025},
eprint={2501.01028},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2501.01028},
}
Haon-Chen/speed-embedding-7b-instruct¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | not specified | 32.8K | 13.2 GB | 2024-10-31 | eng-Latn |
Citation
@article{chen2024little,
title={Little Giants: Synthesizing High-Quality Embedding Data at Scale},
author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
journal={arXiv preprint arXiv:2410.18634},
year={2024}
}
HooshvareLab/bert-base-parsbert-uncased¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 162.8M | 768 | 512 | 621.0 MB | 2021-05-19 | fas-Arab |
Citation
@article{ParsBERT,
title={ParsBERT: Transformer-based Model for Persian Language Understanding},
author={Farahani, Mehrdad and Gharachorloo, Mohammad and Farahani, Marzieh and Manthouri, Mohammad},
journal={ArXiv},
year={2020},
volume={abs/2005.12515}
}
Hum-Works/lodestone-base-4096-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 137.4M | 768 | not specified | not specified | 2023-08-25 | eng-Latn |
IEITYuan/Yuan-embedding-2.0-zh¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 327.4M | 1792 | 512 | 1.2 GB | 2025-11-24 | zho-Hans |
Jaume/gemma-2b-embeddings¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 2.5B | 2048 | 8.2K | 9.3 GB | 2024-06-29 | not specified |
KBLab/sentence-bert-swedish-cased¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.7M | 768 | 384 | 476.0 MB | 2023-01-11 | swe-Latn |
Citation
@misc{rekathati2021introducing,
author = {Rekathati, Faton},
title = {The KBLab Blog: Introducing a Swedish Sentence Transformer},
url = {https://kb-labb.github.io/posts/2021-08-23-a-swedish-sentence-transformer/},
year = {2021}
}
KFST/XLMRoberta-en-da-sv-nb¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 512 | 1.0 GB | 2022-02-22 | dan-Latn, eng-Latn, nno-Latn, nob-Latn, swe-Latn |
KennethEnevoldsen/dfm-sentence-encoder-large¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 355.1M | 1024 | 512 | 1.5 GB | 2023-07-12 | dan-Latn |
Citation
@article{enevoldsenScandinavianEmbeddingBenchmarks2024,
title = {The {Scandinavian} {Embedding} {Benchmarks}: {Comprehensive} {Assessment} of {Multilingual} and {Monolingual} {Text} {Embedding}},
shorttitle = {The {Scandinavian} {Embedding} {Benchmarks}},
url = {https://openreview.net/forum?id=pJl_i7HIA72},
language = {en},
urldate = {2024-04-12},
author = {Enevoldsen, Kenneth and Kardos, Márton and Muennighoff, Niklas and Nielbo, Kristoffer},
month = feb,
year = {2024},
}
KennethEnevoldsen/dfm-sentence-encoder-medium¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.4M | 768 | 512 | 475.0 MB | 2023-07-12 | dan-Latn |
Citation
@article{enevoldsenScandinavianEmbeddingBenchmarks2024,
title = {The {Scandinavian} {Embedding} {Benchmarks}: {Comprehensive} {Assessment} of {Multilingual} and {Monolingual} {Text} {Embedding}},
shorttitle = {The {Scandinavian} {Embedding} {Benchmarks}},
url = {https://openreview.net/forum?id=pJl_i7HIA72},
language = {en},
urldate = {2024-04-12},
author = {Enevoldsen, Kenneth and Kardos, Márton and Muennighoff, Niklas and Nielbo, Kristoffer},
month = feb,
year = {2024},
}
Kowshik24/bangla-sentence-transformer-ft-matryoshka-paraphrase-multilingual-mpnet-base-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 128 | 1.0 GB | 2025-11-10 | ben-Beng |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
Lajavaness/bilingual-embedding-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 514 | 1.0 GB | 2024-06-26 | not specified |
Citation
@article{DBLP:journals/corr/abs-1911-02116,
author = {Alexis Conneau and
Kartikay Khandelwal and
Naman Goyal and
Vishrav Chaudhary and
Guillaume Wenzek and
Francisco Guzm{\'{a}}n and
Edouard Grave and
Myle Ott and
Luke Zettlemoyer and
Veselin Stoyanov},
title = {Unsupervised Cross-lingual Representation Learning at Scale},
journal = {CoRR},
volume = {abs/1911.02116},
year = {2019},
url = {http://arxiv.org/abs/1911.02116},
eprinttype = {arXiv},
eprint = {1911.02116},
timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{reimers2019sentence,
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
author={Reimers, Nils and Gurevych, Iryna},
journal={arXiv preprint arXiv:1908.10084},
year={2019}
}
@article{thakur2020augmented,
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
journal={arXiv e-prints},
pages={arXiv--2010},
year={2020}
}
Lajavaness/bilingual-embedding-large¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 559.9M | 1024 | 514 | 2.1 GB | 2024-06-24 | eng-Latn, fra-Latn |
Citation
@article{DBLP:journals/corr/abs-1911-02116,
author = {Alexis Conneau and
Kartikay Khandelwal and
Naman Goyal and
Vishrav Chaudhary and
Guillaume Wenzek and
Francisco Guzm{\'{a}}n and
Edouard Grave and
Myle Ott and
Luke Zettlemoyer and
Veselin Stoyanov},
title = {Unsupervised Cross-lingual Representation Learning at Scale},
journal = {CoRR},
volume = {abs/1911.02116},
year = {2019},
url = {http://arxiv.org/abs/1911.02116},
eprinttype = {arXiv},
eprint = {1911.02116},
timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{reimers2019sentence,
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
author={Reimers, Nils and Gurevych, Iryna},
journal={arXiv preprint arXiv:1908.10084},
year={2019}
}
@article{thakur2020augmented,
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
journal={arXiv e-prints},
pages={arXiv--2010},
year={2020}
}
Lajavaness/bilingual-embedding-small¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 117.7M | 384 | 512 | 449.0 MB | 2024-07-17 | eng-Latn, fra-Latn |
Citation
@article{DBLP:journals/corr/abs-1911-02116,
author = {Alexis Conneau and
Kartikay Khandelwal and
Naman Goyal and
Vishrav Chaudhary and
Guillaume Wenzek and
Francisco Guzm{\'{a}}n and
Edouard Grave and
Myle Ott and
Luke Zettlemoyer and
Veselin Stoyanov},
title = {Unsupervised Cross-lingual Representation Learning at Scale},
journal = {CoRR},
volume = {abs/1911.02116},
year = {2019},
url = {http://arxiv.org/abs/1911.02116},
eprinttype = {arXiv},
eprint = {1911.02116},
timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{reimers2019sentence,
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
author={Reimers, Nils and Gurevych, Iryna},
journal={arXiv preprint arXiv:1908.10084},
year={2019}
}
@article{thakur2020augmented,
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
journal={arXiv e-prints},
pages={arXiv--2010},
year={2020}
}
MCINext/Hakim¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.4M | 768 | 512 | 475.0 MB | 2025-05-10 | fas-Arab |
Citation
@article{sarmadi2025hakim,
title={Hakim: Farsi Text Embedding Model},
author={Sarmadi, Mehran and Alikhani, Morteza and Zinvandi, Erfan and Pourbahman, Zahra},
journal={arXiv preprint arXiv:2505.08435},
year={2025}
}
MCINext/Hakim-small¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 38.7M | 512 | 512 | 148.0 MB | 2025-05-10 | fas-Arab |
Citation
@article{sarmadi2025hakim,
title={Hakim: Farsi Text Embedding Model},
author={Sarmadi, Mehran and Alikhani, Morteza and Zinvandi, Erfan and Pourbahman, Zahra},
journal={arXiv preprint arXiv:2505.08435},
year={2025}
}
MCINext/Hakim-unsup¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.4M | 768 | 512 | 475.0 MB | 2025-05-10 | fas-Arab |
Citation
@article{sarmadi2025hakim,
title={Hakim: Farsi Text Embedding Model},
author={Sarmadi, Mehran and Alikhani, Morteza and Zinvandi, Erfan and Pourbahman, Zahra},
journal={arXiv preprint arXiv:2505.08435},
year={2025}
}
Mihaiii/Bulbasaur¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 17.4M | 384 | 512 | 66.0 MB | 2024-04-27 | not specified |
Mihaiii/Ivysaur¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 384 | 512 | 87.0 MB | 2024-04-27 | not specified |
Mihaiii/Squirtle¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 15.6M | 384 | 512 | 60.0 MB | 2024-04-30 | not specified |
Mihaiii/Venusaur¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 15.6M | 384 | 512 | 60.0 MB | 2024-04-29 | not specified |
Mihaiii/Wartortle¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 17.4M | 384 | 512 | 66.0 MB | 2024-04-30 | not specified |
Mihaiii/gte-micro¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 17.4M | 384 | 512 | 66.0 MB | 2024-04-21 | not specified |
Mihaiii/gte-micro-v4¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 19.2M | 384 | 512 | 73.0 MB | 2024-04-22 | not specified |
Mira190/Euler-Legal-Embedding-V1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.6B | 4096 | 1.5K | 15.3 GB | 2025-11-06 | eng-Latn |
Citation
@misc{euler2025legal,
title={Euler-Legal-Embedding: Advanced Legal Representation Learning},
author={LawRank Team},
year={2025},
publisher={Hugging Face}
}
NbAiLab/nb-bert-base¶
License: cc-by-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 177.9M | 768 | 512 | 681.0 MB | 2021-01-13 | nno-Latn, nob-Latn |
NbAiLab/nb-bert-large¶
License: cc-by-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 355.1M | 1024 | 512 | 1.3 GB | 2021-04-29 | nno-Latn, nob-Latn |
NbAiLab/nb-sbert-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 177.9M | 4096 | 75 | 678.0 MB | 2022-11-23 | dan-Latn, nno-Latn, nob-Latn, swe-Latn |
NeuML/pubmedbert-base-embeddings-100K¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 100.0K | 64 | Infinite | 0.0 MB | 2025-01-03 | eng-Latn |
NeuML/pubmedbert-base-embeddings-1M¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.0M | 64 | Infinite | 2.0 MB | 2025-01-03 | eng-Latn |
NeuML/pubmedbert-base-embeddings-2M¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.9M | 64 | Infinite | 7.0 MB | 2025-01-03 | eng-Latn |
NeuML/pubmedbert-base-embeddings-500K¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 500.0K | 64 | Infinite | 2.0 MB | 2025-01-03 | eng-Latn |
NeuML/pubmedbert-base-embeddings-8M¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.8M | 256 | Infinite | 30.0 MB | 2025-01-03 | eng-Latn |
Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 135.2M | 768 | 512 | 516.0 MB | 2024-06-16 | ara-Arab |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 117.7M | 384 | 512 | 449.0 MB | 2024-06-25 | ara-Arab |
Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 135.2M | 768 | 768 | 516.0 MB | 2024-07-28 | ara-Arab |
Citation
@article{nacar2025gate,
title={GATE: General Arabic Text Embedding for Enhanced Semantic Textual Similarity with Matryoshka Representation Learning and Hybrid Loss Training},
author={Nacar, Omer and Koubaa, Anis and Sibaee, Serry and Al-Habashi, Yasser and Ammar, Adel and Boulila, Wadii},
journal={arXiv preprint arXiv:2505.24581},
year={2025}
}
Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 514 | 1.0 GB | 2024-06-14 | ara-Arab |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
Omartificial-Intelligence-Space/Arabic-labse-Matryoshka¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 470.9M | 768 | 512 | 1.8 GB | 2024-06-16 | ara-Arab |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 514 | 418.0 MB | 2024-06-15 | ara-Arab |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 162.8M | 768 | 512 | 621.0 MB | 2024-06-17 | ara-Arab |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
OpenSearch-AI/Ops-MoA-Conan-embedding-v1¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 343.4M | 1536 | 512 | 1.3 GB | 2025-03-26 | zho-Hans |
OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 343.4M | 1536 | 512 | 1.2 GB | 2025-03-26 | zho-Hans |
OrdalieTech/Solon-embeddings-large-0.1¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 559.9M | 1024 | 514 | 2.1 GB | 2023-12-09 | fra-Latn |
OrdalieTech/Solon-embeddings-mini-beta-1.1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 211.8M | 768 | 8.2K | 808.0 MB | 2025-01-01 | fra-Latn |
PartAI/Tooka-SBERT¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 353.0M | 1024 | 512 | 1.3 GB | 2024-12-07 | fas-Arab |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
PartAI/Tooka-SBERT-V2-Large¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 353.0M | 1024 | 512 | 1.3 GB | 2025-05-01 | fas-Arab |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
PartAI/Tooka-SBERT-V2-Small¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 122.9M | 768 | 512 | 496.0 MB | 2025-05-01 | fas-Arab |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
PartAI/TookaBERT-Base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 122.9M | 768 | 512 | 469.0 MB | 2024-12-08 | fas-Arab |
Qodo/Qodo-Embed-1-1.5B¶
License: https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.5B | 1536 | 32.8K | 6.6 GB | 2025-02-19 | c#-Code, c++-Code, go-Code, java-Code, javascript-Code, ... (9) |
Qodo/Qodo-Embed-1-7B¶
License: https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 3584 | 32.8K | 28.4 GB | 2025-02-24 | c#-Code, c++-Code, go-Code, java-Code, javascript-Code, ... (9) |
Querit/Querit¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.9B | 1024 | 4.1K | 9.2 GB | 2026-03-08 | eng-Latn |
Shuu12121/CodeSearch-ModernBERT-Crow-Plus¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 151.7M | 768 | 1.0K | 607.0 MB | 2025-04-21 | eng-Latn |
TencentBAC/Conan-embedding-v1¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 326.0M | 768 | 512 | 1.2 GB | 2024-08-22 | zho-Hans |
Citation
@misc{li2024conanembeddinggeneraltextembedding,
title={Conan-embedding: General Text Embedding with More and Better Negative Samples},
author={Shiyu Li and Yang Tang and Shizhe Chen and Xi Chen},
year={2024},
eprint={2408.15710},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2408.15710},
}
VoVanPhuc/sup-SimCSE-VietNamese-phobert-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 135.0M | 768 | 256 | 517.0 MB | 2021-05-26 | vie-Latn |
Citation
@article{gao2021simcse,
title={{SimCSE}: Simple Contrastive Learning of Sentence Embeddings},
author={Gao, Tianyu and Yao, Xingcheng and Chen, Danqi},
journal={arXiv preprint arXiv:2104.08821},
year={2021}
}
@inproceedings{phobert,
title = {{PhoBERT: Pre-trained language models for Vietnamese}},
author = {Dat Quoc Nguyen and Anh Tuan Nguyen},
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
year = {2020},
pages = {1037--1042}
}
aari1995/German_Semantic_STS_V2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.7M | 1024 | 512 | 1.3 GB | 2022-11-17 | deu-Latn |
abhinand/MedEmbed-small-v0.1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 384 | 512 | 127.0 MB | 2024-10-20 | eng-Latn |
Citation
@software{balachandran2024medembed,
author = {Balachandran, Abhinand},
title = {MedEmbed: Medical-Focused Embedding Models},
year = {2024},
url = {https://github.com/abhinand5/MedEmbed}
}
ai-forever/sbert_large_mt_nlu_ru¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 427.0M | 1024 | 512 | 1.6 GB | 2021-05-18 | rus-Cyrl |
ai-forever/sbert_large_nlu_ru¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 427.0M | 1024 | 512 | 1.6 GB | 2020-11-20 | rus-Cyrl |
amazon/Titan-text-embeddings-v2¶
License: https://aws.amazon.com/service-terms/ • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | 2024-04-30 | eng-Latn |
andersborges/model2vecdk¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 48.0M | 256 | Infinite | 183.0 MB | 2025-11-21 | dan-Latn |
Citation
@article{minishlab2024model2vec,
author = {Tulkens, Stephan and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
url = {https://github.com/MinishLab/model2vec}
}
andersborges/model2vecdk-stem¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 48.6M | 256 | Infinite | 185.0 MB | 2025-11-21 | dan-Latn |
Citation
@article{minishlab2024model2vec,
author = {Tulkens, Stephan and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
url = {https://github.com/MinishLab/model2vec}
}
avsolatorio/GIST-Embedding-v0¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 512 | 418.0 MB | 2024-01-31 | eng-Latn |
Citation
@article{solatorio2024gistembed,
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
author={Aivin V. Solatorio},
journal={arXiv preprint arXiv:2402.16829},
year={2024},
url={https://arxiv.org/abs/2402.16829},
eprint={2402.16829},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
avsolatorio/GIST-all-MiniLM-L6-v2¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 384 | 512 | 87.0 MB | 2024-02-03 | eng-Latn |
Citation
@article{solatorio2024gistembed,
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
author={Aivin V. Solatorio},
journal={arXiv preprint arXiv:2402.16829},
year={2024},
url={https://arxiv.org/abs/2402.16829},
eprint={2402.16829},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
avsolatorio/GIST-large-Embedding-v0¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.1M | 1024 | 512 | 1.2 GB | 2024-02-14 | eng-Latn |
Citation
@article{solatorio2024gistembed,
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
author={Aivin V. Solatorio},
journal={arXiv preprint arXiv:2402.16829},
year={2024},
url={https://arxiv.org/abs/2402.16829},
eprint={2402.16829},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
avsolatorio/GIST-small-Embedding-v0¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 384 | 512 | 127.0 MB | 2024-02-03 | eng-Latn |
Citation
@article{solatorio2024gistembed,
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
author={Aivin V. Solatorio},
journal={arXiv preprint arXiv:2402.16829},
year={2024},
url={https://arxiv.org/abs/2402.16829},
eprint={2402.16829},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
avsolatorio/NoInstruct-small-Embedding-v0¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 384 | 512 | 127.0 MB | 2024-05-01 | eng-Latn |
baseline/Human¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | not specified | ara-Arab, dan-Latn, eng-Latn, nob-Latn, rus-Cyrl |
bedrock/amazon-titan-embed-text-v1¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1536 | 8.2K | not specified | 2023-09-27 | not specified |
bedrock/amazon-titan-embed-text-v2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 8.2K | not specified | 2024-04-30 | not specified |
bigscience/sgpt-bloom-7b1-msmarco¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | 4096 | not specified | not specified | 2022-08-26 | not specified |
Citation
@article{muennighoff2022sgpt,
title={SGPT: GPT Sentence Embeddings for Semantic Search},
author={Muennighoff, Niklas},
journal={arXiv preprint arXiv:2202.08904},
year={2022}
}
bisectgroup/BiCA-base¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 512 | 418.0 MB | 2025-11-14 | eng-Latn |
Citation
@misc{sinha2025bicaeffectivebiomedicaldense,
title={BiCA: Effective Biomedical Dense Retrieval with Citation-Aware Hard Negatives},
author={Aarush Sinha and Pavan Kumar S and Roshan Balaji and Nirav Pravinbhai Bhatt},
year={2025},
eprint={2511.08029},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2511.08029},
}
bkai-foundation-models/vietnamese-bi-encoder¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 135.0M | 768 | 256 | 515.0 MB | 2023-09-09 | vie-Latn |
Citation
@article{duc2024towards,
title={Towards Comprehensive Vietnamese Retrieval-Augmented Generation and Large Language Models},
author={Nguyen Quang Duc and Le Hai Son and Nguyen Duc Nhan and Nguyen Dich Nhat Minh and Le Thanh Huong and Dinh Viet Sang},
journal={arXiv preprint arXiv:2403.01616},
year={2024}
}
brahmairesearch/slx-v0.1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 384 | 512 | 87.0 MB | 2024-08-13 | eng-Latn |
castorini/monot5-3b-msmarco-10k¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 3.0B | not specified | not specified | not specified | 2022-03-28 | eng-Latn |
Citation
@misc{rosa2022parameterleftbehinddistillation,
title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
year={2022},
eprint={2206.02873},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2206.02873},
}
castorini/monot5-base-msmarco-10k¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 296.9M | not specified | not specified | not specified | 2022-03-28 | eng-Latn |
Citation
@misc{rosa2022parameterleftbehinddistillation,
title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
year={2022},
eprint={2206.02873},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2206.02873},
}
castorini/monot5-large-msmarco-10k¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | 2022-03-28 | eng-Latn |
Citation
@misc{rosa2022parameterleftbehinddistillation,
title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
year={2022},
eprint={2206.02873},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2206.02873},
}
castorini/monot5-small-msmarco-10k¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | 2022-03-28 | eng-Latn |
Citation
@misc{rosa2022parameterleftbehinddistillation,
title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
year={2022},
eprint={2206.02873},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2206.02873},
}
codesage/codesage-base-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 354.7M | 1024 | 2.0K | 1.3 GB | 2024-02-03 | go-Code, java-Code, javascript-Code, php-Code, python-Code, ... (6) |
Citation
@inproceedings{
zhang2024code,
title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
booktitle={The Twelfth International Conference on Learning Representations},
year={2024},
url={https://openreview.net/forum?id=vfzRRjumpX}
}
codesage/codesage-large-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.3B | 2048 | 2.0K | 4.8 GB | 2024-02-03 | go-Code, java-Code, javascript-Code, php-Code, python-Code, ... (6) |
Citation
@inproceedings{
zhang2024code,
title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
booktitle={The Twelfth International Conference on Learning Representations},
year={2024},
url={https://openreview.net/forum?id=vfzRRjumpX}
}
codesage/codesage-small-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 128.0M | 1024 | 2.0K | 496.0 MB | 2024-02-03 | go-Code, java-Code, javascript-Code, php-Code, python-Code, ... (6) |
Citation
@inproceedings{
zhang2024code,
title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
booktitle={The Twelfth International Conference on Learning Representations},
year={2024},
url={https://openreview.net/forum?id=vfzRRjumpX}
}
cointegrated/LaBSE-en-ru¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 129.0M | 768 | 512 | 492.0 MB | 2021-06-10 | rus-Cyrl |
cointegrated/rubert-tiny¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 11.9M | 312 | 512 | 45.0 MB | 2021-05-24 | rus-Cyrl |
cointegrated/rubert-tiny2¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 29.4M | 312 | 2.0K | 112.0 MB | 2021-10-28 | rus-Cyrl |
colbert-ir/colbertv2.0¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | not specified | 180 | 418.0 MB | 2024-09-21 | eng-Latn |
consciousAI/cai-lunaris-text-embeddings¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.1M | 1024 | 512 | not specified | 2023-06-22 | not specified |
consciousAI/cai-stellaris-text-embeddings¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 514 | not specified | 2023-06-23 | not specified |
contextboxai/halong_embedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 514 | 1.0 GB | 2024-07-06 | vie-Latn |
Citation
@misc{HalongEmbedding,
title={HalongEmbedding: A Vietnamese Text Embedding},
author={Ngo Hieu},
year={2024},
publisher={Huggingface},
}
cross-encoder/ms-marco-MiniLM-L12-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 384 | 512 | 127.0 MB | 2021-04-16 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
cross-encoder/ms-marco-MiniLM-L2-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 15.6M | 384 | 512 | 60.0 MB | 2021-04-16 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
cross-encoder/ms-marco-MiniLM-L4-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 19.2M | 384 | 512 | 73.0 MB | 2021-04-16 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
cross-encoder/ms-marco-MiniLM-L6-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 384 | 512 | 87.0 MB | 2021-04-16 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
cross-encoder/ms-marco-TinyBERT-L2-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.4M | 128 | 512 | 17.0 MB | 2021-04-16 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
deepfile/embedder-100p¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 514 | 1.0 GB | 2023-07-24 | not specified |
deepvk/USER-bge-m3¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 359.0M | 1024 | 8.2K | 1.3 GB | 2024-07-05 | rus-Cyrl |
Citation
@misc{deepvk2024user,
title={USER: Universal Sentence Encoder for Russian},
author={Malashenko, Boris and Zemerov, Anton and Spirin, Egor},
url={https://huggingface.co/datasets/deepvk/USER-base},
publisher={Hugging Face},
year={2024},
}
deepvk/deberta-v1-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.0M | 768 | 512 | 473.0 MB | 2023-02-07 | rus-Cyrl |
dmedhi/PawanEmbd-68M¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 67.8M | 768 | 512 | 260.0 MB | 2025-12-08 | eng-Latn |
Citation
@misc{medhi2025pawanembd,
title={PawanEmbd-68M: Distilled Embedding Model},
author={Medhi, D.},
year={2025},
url={https://huggingface.co/dmedhi/PawanEmbd-68M}
}
dunzhang/stella-large-zh-v3-1792d¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 327.4M | 1792 | 512 | not specified | 2024-02-17 | zho-Hans |
dunzhang/stella-mrl-large-zh-v3.5-1792d¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 326.0M | 1792 | 512 | 1.2 GB | 2024-02-27 | zho-Hans |
dwzhu/e5-base-4k¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 112.2M | not specified | 4.1K | not specified | 2024-03-28 | eng-Latn |
Citation
@article{zhu2024longembed,
title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian},
journal={arXiv preprint arXiv:2404.12096},
year={2024}
}
facebook/SONAR¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 512 | not specified | 2021-05-21 | ace-Arab, ace-Latn, acm-Arab, acq-Arab, aeb-Arab, ... (204) |
Citation
@misc{Duquenne:2023:sonar_arxiv,
author = {Paul-Ambroise Duquenne and Holger Schwenk and Benoit Sagot},
title = {{SONAR:} Sentence-Level Multimodal and Language-Agnostic Representations},
publisher = {arXiv},
year = {2023},
url = {https://arxiv.org/abs/2308.11466},
}
facebook/contriever-msmarco¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 512 | 572.0 MB | 2022-06-25 | eng-Latn |
Citation
@misc{izacard2021contriever,
title={Unsupervised Dense Information Retrieval with Contrastive Learning},
author={Gautier Izacard and Mathilde Caron and Lucas Hosseini and Sebastian Riedel and Piotr Bojanowski and Armand Joulin and Edouard Grave},
year={2021},
url = {https://arxiv.org/abs/2112.09118},
doi = {10.48550/ARXIV.2112.09118},
}
fangxq/XYZ-embedding¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 325.5M | 768 | 512 | 1.2 GB | 2024-09-13 | zho-Hans |
geoffsee/auto-g-embed-st¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 384 | 256 | 87.0 MB | 2026-02-08 | eng-Latn |
google/flan-t5-base¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 247.6M | not specified | not specified | 944.0 MB | 2022-10-21 | eng-Latn |
Citation
@misc{10.48550/arxiv.2210.11416,
doi = {10.48550/ARXIV.2210.11416},
url = {https://arxiv.org/abs/2210.11416},
author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Scaling Instruction-Finetuned Language Models},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
google/flan-t5-large¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 783.2M | not specified | not specified | 2.9 GB | 2022-10-21 | eng-Latn |
Citation
@misc{10.48550/arxiv.2210.11416,
doi = {10.48550/ARXIV.2210.11416},
url = {https://arxiv.org/abs/2210.11416},
author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Scaling Instruction-Finetuned Language Models},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
google/flan-t5-xl¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | 10.6 GB | 2022-10-21 | eng-Latn |
Citation
@misc{10.48550/arxiv.2210.11416,
doi = {10.48550/ARXIV.2210.11416},
url = {https://arxiv.org/abs/2210.11416},
author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Scaling Instruction-Finetuned Language Models},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
google/flan-t5-xxl¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | 42.0 GB | 2022-10-21 | eng-Latn |
Citation
@misc{10.48550/arxiv.2210.11416,
doi = {10.48550/ARXIV.2210.11416},
url = {https://arxiv.org/abs/2210.11416},
author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Scaling Instruction-Finetuned Language Models},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
iampanda/zpoint_large_embedding_zh¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 326.0M | 1792 | 512 | 1.2 GB | 2024-06-04 | zho-Hans |
ibm-granite/granite-embedding-107m-multilingual¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 107.0M | 384 | 512 | 204.0 MB | 2024-12-18 | ara-Latn, ces-Latn, deu-Latn, eng-Latn, fra-Latn, ... (13) |
Citation
@article{awasthy2025graniteembedding,
title={Granite Embedding Models},
author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
journal={arXiv preprint arXiv:2502.20204},
year={2025}
}
ibm-granite/granite-embedding-125m-english¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.6M | 768 | 512 | 238.0 MB | 2024-12-18 | eng-Latn |
Citation
@article{awasthy2025graniteembedding,
title={Granite Embedding Models},
author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
journal={arXiv preprint arXiv:2502.20204},
year={2025}
}
ibm-granite/granite-embedding-278m-multilingual¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 512 | 530.0 MB | 2024-12-18 | ara-Latn, ces-Latn, deu-Latn, eng-Latn, fra-Latn, ... (13) |
Citation
@article{awasthy2025graniteembedding,
title={Granite Embedding Models},
author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
journal={arXiv preprint arXiv:2502.20204},
year={2025}
}
ibm-granite/granite-embedding-30m-english¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 30.3M | 384 | 512 | 58.0 MB | 2024-12-18 | eng-Latn |
Citation
@article{awasthy2025graniteembedding,
title={Granite Embedding Models},
author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
journal={arXiv preprint arXiv:2502.20204},
year={2025}
}
ibm-granite/granite-embedding-english-r2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 768 | 8.2K | 284.0 MB | 2025-08-15 | eng-Latn |
Citation
@article{awasthy2025graniteembedding,
title={Granite Embedding Models},
author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
journal={arXiv preprint arXiv:2502.20204},
year={2025}
}
ibm-granite/granite-embedding-small-english-r2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 47.7M | 384 | 8.2K | 91.0 MB | 2025-08-15 | eng-Latn |
Citation
@article{awasthy2025graniteembedding,
title={Granite Embedding Models},
author={Awasthy, Parul and Trivedi, Aashka and Li, Yulong and Bornea, Mihaela and Cox, David and Daniels, Abraham and Franz, Martin and Goodhart, Gabe and Iyer, Bhavani and Kumar, Vishwajeet and Lastras, Luis and McCarley, Scott and Murthy, Rudra and P, Vignesh and Rosenthal, Sara and Roukos, Salim and Sen, Jaydeep and Sharma, Sukriti and Sil, Avirup and Soule, Kate and Sultan, Arafat and Florian, Radu},
journal={arXiv preprint arXiv:2502.20204},
year={2025}
}
infgrad/stella-base-en-v2¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | not specified | 512 | not specified | 2023-10-19 | eng-Latn |
infgrad/stella-base-zh-v3-1792d¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 104.0M | 1792 | 512 | not specified | 2024-02-17 | zho-Hans |
izhx/udever-bloom-1b1¶
License: https://huggingface.co/spaces/bigscience/license • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.1B | not specified | not specified | not specified | 2023-10-24 | aka-Latn, ara-Arab, asm-Beng, bam-Latn, ben-Beng, ... (45) |
Citation
@article{zhang2023language,
title={Language Models are Universal Embedders},
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
journal={arXiv preprint arXiv:2310.08232},
year={2023}
}
izhx/udever-bloom-3b¶
License: https://huggingface.co/spaces/bigscience/license • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 3.0B | not specified | not specified | not specified | 2023-10-24 | aka-Latn, ara-Arab, asm-Beng, bam-Latn, ben-Beng, ... (45) |
Citation
@article{zhang2023language,
title={Language Models are Universal Embedders},
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
journal={arXiv preprint arXiv:2310.08232},
year={2023}
}
izhx/udever-bloom-560m¶
License: https://huggingface.co/spaces/bigscience/license • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 559.2M | not specified | not specified | not specified | 2023-10-24 | aka-Latn, ara-Arab, asm-Beng, bam-Latn, ben-Beng, ... (45) |
Citation
@article{zhang2023language,
title={Language Models are Universal Embedders},
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
journal={arXiv preprint arXiv:2310.08232},
year={2023}
}
izhx/udever-bloom-7b1¶
License: https://huggingface.co/spaces/bigscience/license • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | 2023-10-24 | aka-Latn, ara-Arab, asm-Beng, bam-Latn, ben-Beng, ... (45) |
Citation
@article{zhang2023language,
title={Language Models are Universal Embedders},
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
journal={arXiv preprint arXiv:2310.08232},
year={2023}
}
jhu-clsp/FollowIR-7B¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.1B | not specified | not specified | 13.5 GB | 2024-04-29 | eng-Latn |
Citation
@misc{weller2024followir,
title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions},
author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini},
year={2024},
eprint={2403.15246},
archivePrefix={arXiv},
primaryClass={cs.IR}
}
jinaai/jina-colbert-v2¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 559.4M | not specified | 8.2K | 1.0 GB | 2024-08-16 | ara-Arab, ben-Beng, deu-Latn, eng-Latn, fas-Arab, ... (22) |
Citation
@inproceedings{xiao-etal-2024-jina,
title = "{J}ina-{C}ol{BERT}-v2: A General-Purpose Multilingual Late Interaction Retriever",
author = {Jha, Rohan and
Wang, Bo and
G{\"u}nther, Michael and
Mastrapas, Georgios and
Sturua, Saba and
Mohr, Isabelle and
Koukounas, Andreas and
Wang, Mohammad Kalim and
Wang, Nan and
Xiao, Han},
editor = {S{\"a}lev{\"a}, Jonne and
Owodunni, Abraham},
booktitle = "Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024)",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.mrl-1.11/",
doi = "10.18653/v1/2024.mrl-1.11",
pages = "159--166",
abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT's late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model's retrieval performance and cut storage requirements by up to 50{\%}. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks,"
}
jinaai/jina-embedding-b-en-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.6M | 768 | 512 | 420.0 MB | 2023-07-07 | eng-Latn |
Citation
@misc{günther2023jina,
title={Jina Embeddings: A Novel Set of High-Performance Sentence Embedding Models},
author={Michael Günther and Louis Milliken and Jonathan Geuter and Georgios Mastrapas and Bo Wang and Han Xiao},
year={2023},
eprint={2307.11224},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
jinaai/jina-embedding-s-en-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 35.3M | 512 | 512 | 134.0 MB | 2023-07-07 | eng-Latn |
Citation
@misc{günther2023jina,
title={Jina Embeddings: A Novel Set of High-Performance Sentence Embedding Models},
author={Michael Günther and Louis Milliken and Jonathan Geuter and Georgios Mastrapas and Bo Wang and Han Xiao},
year={2023},
eprint={2307.11224},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
jinaai/jina-embeddings-v2-base-en¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 137.4M | 768 | 8.2K | 262.0 MB | 2023-09-27 | eng-Latn |
Citation
@misc{günther2023jina,
title={Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long Documents},
author={Michael Günther and Jackmin Ong and Isabelle Mohr and Alaeddine Abdessalem and Tanguy Abel and Mohammad Kalim Akram and Susana Guzman and Georgios Mastrapas and Saba Sturua and Bo Wang and Maximilian Werk and Nan Wang and Han Xiao},
year={2023},
eprint={2310.19923},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
jinaai/jina-embeddings-v2-small-en¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 32.7M | 512 | 8.2K | 62.0 MB | 2023-09-27 | eng-Latn |
Citation
@misc{günther2023jina,
title={Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long Documents},
author={Michael Günther and Jackmin Ong and Isabelle Mohr and Alaeddine Abdessalem and Tanguy Abel and Mohammad Kalim Akram and Susana Guzman and Georgios Mastrapas and Saba Sturua and Bo Wang and Maximilian Werk and Nan Wang and Han Xiao},
year={2023},
eprint={2310.19923},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
jinaai/jina-reranker-v2-base-multilingual¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | 531.0 MB | 2024-09-26 | eng-Latn |
jinaai/jina-reranker-v3¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 596.8M | not specified | 131.1K | 1.1 GB | 2025-09-18 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{wang2025jinarerankerv3lateinteractionlistwise,
title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
author={Feng Wang and Yuqing Li and Han Xiao},
year={2025},
eprint={2509.25085},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2509.25085},
}
keeeeenw/MicroLlama-text-embedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 271.9M | 1024 | 2.0K | 1.0 GB | 2024-11-10 | eng-Latn |
lier007/xiaobu-embedding¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 326.0M | 1024 | 512 | 1.2 GB | 2024-01-09 | zho-Hans |
lier007/xiaobu-embedding-v2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 326.0M | 768 | 512 | 1.2 GB | 2024-06-30 | zho-Hans |
lightonai/ColBERT-Zero¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 128 | 8.2K | 568.0 MB | 2026-02-19 | eng-Latn |
Citation
@misc{chaffin2026colbertzeropretrainpretraincolbert,
title = {ColBERT-Zero: To Pre-train Or Not To Pre-train ColBERT models},
author = {Antoine Chaffin and Luca Arnaboldi and Amélie Chatelain and Florent Krzakala},
year = {2026},
eprint = {2602.16609},
archivePrefix = {arXiv},
primaryClass = {cs.CL},
url = {https://arxiv.org/abs/2602.16609},
}
lightonai/ColBERT-Zero-supervised¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 128 | 8.2K | 568.0 MB | 2026-02-19 | eng-Latn |
Citation
@misc{chaffin2026colbertzeropretrainpretraincolbert,
title = {ColBERT-Zero: To Pre-train Or Not To Pre-train ColBERT models},
author = {Antoine Chaffin and Luca Arnaboldi and Amélie Chatelain and Florent Krzakala},
year = {2026},
eprint = {2602.16609},
archivePrefix = {arXiv},
primaryClass = {cs.CL},
url = {https://arxiv.org/abs/2602.16609},
}
lightonai/ColBERT-Zero-unsupervised¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 128 | 8.2K | 568.0 MB | 2026-02-19 | eng-Latn |
Citation
@misc{chaffin2026colbertzeropretrainpretraincolbert,
title = {ColBERT-Zero: To Pre-train Or Not To Pre-train ColBERT models},
author = {Antoine Chaffin and Luca Arnaboldi and Amélie Chatelain and Florent Krzakala},
year = {2026},
eprint = {2602.16609},
archivePrefix = {arXiv},
primaryClass = {cs.CL},
url = {https://arxiv.org/abs/2602.16609},
}
lightonai/GTE-ModernColBERT-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 128 | 8.2K | 568.0 MB | 2025-04-30 | eng-Latn |
Citation
@misc{GTE-ModernColBERT,
title={GTE-ModernColBERT},
author={Chaffin, Antoine},
url={https://huggingface.co/lightonai/GTE-ModernColBERT-v1},
year={2025}
}
lightonai/LateOn-Code¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 128 | 8.2K | 568.0 MB | 2026-02-12 | eng-Latn, go-Code, java-Code, javascript-Code, php-Code, ... (7) |
Citation
@misc{LateOn-Code,
title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
author = {Chaffin, Antoine},
url = {https://huggingface.co/collections/lightonai/lateon-code},
year = {2026}
}
lightonai/LateOn-Code-edge¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 16.8M | 48 | 8.0K | 64.0 MB | 2026-02-12 | eng-Latn, go-Code, java-Code, javascript-Code, php-Code, ... (7) |
Citation
@misc{LateOn-Code,
title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
author = {Chaffin, Antoine},
url = {https://huggingface.co/collections/lightonai/lateon-code},
year = {2026}
}
lightonai/LateOn-Code-edge-pretrain¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 16.8M | 48 | 8.0K | 64.0 MB | 2026-02-12 | eng-Latn, go-Code, java-Code, javascript-Code, php-Code, ... (7) |
Citation
@misc{LateOn-Code,
title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
author = {Chaffin, Antoine},
url = {https://huggingface.co/collections/lightonai/lateon-code},
year = {2026}
}
lightonai/LateOn-Code-pretrain¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 128 | 8.2K | 568.0 MB | 2026-02-12 | eng-Latn, go-Code, java-Code, javascript-Code, php-Code, ... (7) |
Citation
@misc{LateOn-Code,
title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
author = {Chaffin, Antoine},
url = {https://huggingface.co/collections/lightonai/lateon-code},
year = {2026}
}
lightonai/Reason-ModernColBERT¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 149.0M | 128 | 8.2K | 568.0 MB | 2025-05-22 | eng-Latn |
Citation
@misc{Reason-ModernColBERT,
title={Reason-ModernColBERT},
author={Chaffin, Antoine},
url={https://huggingface.co/lightonai/Reason-ModernColBERT},
year={2025}
}
llmrails/ember-v1¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.0M | 1024 | 512 | 1.2 GB | 2023-10-10 | eng-Latn |
Citation
@misc{nur2024emberv1,
title={ember-v1: SOTA embedding model},
author={Enrike Nur and Anar Aliyev},
year={2023},
}
m3hrdadfi/bert-zwnj-wnli-mean-tokens¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 118.3M | 768 | 512 | 451.0 MB | 2021-06-28 | fas-Arab |
m3hrdadfi/roberta-zwnj-wnli-mean-tokens¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 118.3M | 768 | 514 | 451.0 MB | 2021-06-28 | fas-Arab |
malenia1/ternary-weight-embedding¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 98.7M | 1024 | 512 | 158.0 MB | 2024-10-23 | not specified |
manu/bge-m3-custom-fr¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 567.8M | 1024 | 8.2K | 2.1 GB | 2024-04-11 | not specified |
manu/sentence_croissant_alpha_v0.2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.3B | 2048 | 2.0K | 2.4 GB | 2024-03-15 | not specified |
manu/sentence_croissant_alpha_v0.3¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.3B | 2048 | 2.0K | 2.4 GB | 2024-04-26 | not specified |
manu/sentence_croissant_alpha_v0.4¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.3B | 2048 | 2.0K | 2.4 GB | 2024-04-27 | eng-Latn, fra-Latn |
meta-llama/Llama-2-7b-chat-hf¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.0B | not specified | not specified | not specified | 2023-07-18 | eng-Latn |
Citation
@misc{touvron2023llama2openfoundation,
title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
year={2023},
eprint={2307.09288},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2307.09288},
}
meta-llama/Llama-2-7b-hf¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | 2023-07-18 | eng-Latn |
Citation
@misc{touvron2023llama2openfoundation,
title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
year={2023},
eprint={2307.09288},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2307.09288},
}
microsoft/speecht5_tts¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 146.3M | 768 | not specified | 558.0 MB | 2022-05-16 | eng-Latn |
Citation
@misc{ao2022speecht5unifiedmodalencoderdecoderpretraining,
title={SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
author={Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
year={2022},
eprint={2110.07205},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2110.07205},
}
minishlab/M2V_base_glove¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 102.0M | 256 | Infinite | 391.0 MB | 2024-09-21 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/M2V_base_glove_subword¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 103.0M | 256 | Infinite | 391.0 MB | 2024-09-21 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/M2V_base_output¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.6M | 256 | Infinite | 29.0 MB | 2024-09-21 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/M2V_multilingual_output¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 128.0M | 256 | Infinite | 489.0 MB | 2024-09-21 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/potion-base-2M¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 2.0M | 64 | Infinite | 7.0 MB | 2024-10-29 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/potion-base-32M¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 32.3M | 512 | Infinite | 123.0 MB | 2025-01-22 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/potion-base-4M¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 3.8M | 128 | Infinite | 14.0 MB | 2024-10-29 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/potion-base-8M¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.6M | 256 | Infinite | 29.0 MB | 2024-10-29 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/potion-multilingual-128M¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 128.0M | 256 | Infinite | 489.0 MB | 2025-05-23 | afr-Latn, amh-Ethi, ara-Arab, aze-Latn, bel-Cyrl, ... (101) |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
minishlab/potion-retrieval-32M¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 32.3M | 512 | Infinite | 123.0 MB | 2025-01-23 | eng-Latn |
Citation
@software{minishlab2024model2vec,
author = {Stephan Tulkens and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
publisher = {Zenodo},
doi = {10.5281/zenodo.17270888},
url = {https://github.com/MinishLab/model2vec},
license = {MIT}
}
mistralai/Mistral-7B-Instruct-v0.2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 7.2B | not specified | not specified | not specified | 2023-12-11 | eng-Latn |
Citation
@misc{jiang2023mistral7b,
title={Mistral 7B},
author={Albert Q. Jiang and Alexandre Sablayrolles and Arthur Mensch and Chris Bamford and Devendra Singh Chaplot and Diego de las Casas and Florian Bressand and Gianna Lengyel and Guillaume Lample and Lucile Saulnier and Lélio Renard Lavaud and Marie-Anne Lachaux and Pierre Stock and Teven Le Scao and Thibaut Lavril and Thomas Wang and Timothée Lacroix and William El Sayed},
year={2023},
eprint={2310.06825},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2310.06825},
}
mixedbread-ai/mxbai-edge-colbert-v0-17m¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 17.0M | not specified | 8.0K | 64.0 MB | 2025-10-16 | eng-Latn |
Citation
@misc{takehi2025fantasticsmallretrieverstrain,
title={Fantastic (small) Retrievers and How to Train Them: mxbai-edge-colbert-v0 Tech Report},
author={Rikiya Takehi and Benjamin Clavié and Sean Lee and Aamir Shakir},
year={2025},
eprint={2510.14880},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2510.14880},
}
mixedbread-ai/mxbai-edge-colbert-v0-32m¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 32.0M | not specified | 511 | 122.0 MB | 2025-10-16 | eng-Latn |
Citation
@misc{takehi2025fantasticsmallretrieverstrain,
title={Fantastic (small) Retrievers and How to Train Them: mxbai-edge-colbert-v0 Tech Report},
author={Rikiya Takehi and Benjamin Clavié and Sean Lee and Aamir Shakir},
year={2025},
eprint={2510.14880},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2510.14880},
}
mixedbread-ai/mxbai-rerank-base-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 184.4M | not specified | 512 | 352.0 MB | 2024-02-29 | eng-Latn |
Citation
@online{rerank2024mxbai,
title={Boost Your Search With The Crispy Mixedbread Rerank Models},
author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
year={2024},
url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
}
mixedbread-ai/mxbai-rerank-large-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 435.1M | not specified | 512 | 830.0 MB | 2024-02-29 | eng-Latn |
Citation
@online{rerank2024mxbai,
title={Boost Your Search With The Crispy Mixedbread Rerank Models},
author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
year={2024},
url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
}
mixedbread-ai/mxbai-rerank-xsmall-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 70.8M | not specified | 512 | 135.0 MB | 2024-02-29 | eng-Latn |
Citation
@online{rerank2024mxbai,
title={Boost Your Search With The Crispy Mixedbread Rerank Models},
author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
year={2024},
url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
}
moka-ai/m3e-base¶
License: https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 102.3M | 768 | 512 | 390.0 MB | 2023-06-06 | eng-Latn, zho-Hans |
Citation
@software{MokaMassiveMixedEmbedding,
author = {Wang Yuxin and Sun Qingxuan and He Sicheng},
title = {M3E: Moka Massive Mixed Embedding Model},
year = {2023}
}
moka-ai/m3e-large¶
License: https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 325.5M | 768 | 512 | not specified | 2023-06-21 | eng-Latn, zho-Hans |
Citation
@software{MokaMassiveMixedEmbedding,
author = {Wang Yuxin and Sun Qingxuan and He Sicheng},
title = {M3E: Moka Massive Mixed Embedding Model},
year = {2023}
}
moka-ai/m3e-small¶
License: https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 24.0M | 512 | 512 | not specified | 2023-06-02 | eng-Latn, zho-Hans |
Citation
@software{MokaMassiveMixedEmbedding,
author = {Wang Yuxin and Sun Qingxuan and He Sicheng},
title = {M3E: Moka Massive Mixed Embedding Model},
year = {2023}
}
mteb/baseline-bb25¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | 2026-02-06 | not specified |
Citation
@software{jeong2026bayesianbm25,
title={Bayesian BM25: A Probabilistic Framework for Hybrid Text and Vector Search},
author={Jeong, Jaepil},
year={2026},
doi={10.5281/zenodo.18414941},
url={https://doi.org/10.5281/zenodo.18414941},
}
@software{jeong2026neural,
title={From Bayesian Inference to Neural Computation: The Analytical Emergence of Neural Network Structure from Probabilistic Relevance Estimation},
author={Jeong, Jaepil},
year={2026},
doi={10.5281/zenodo.18512411},
url={https://doi.org/10.5281/zenodo.18512411},
}
mteb/baseline-bm25s¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | 2024-07-10 | eng-Latn |
Citation
@misc{bm25s,
title={BM25S: Orders of magnitude faster lexical search via eager sparse scoring},
author={Xing Han Lù},
year={2024},
eprint={2407.03618},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2407.03618},
}
myrkur/sentence-transformer-parsbert-fa¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 162.8M | 768 | 512 | 621.0 MB | 2024-12-10 | fas-Arab |
nvidia/llama-nemotron-rerank-1b-v2¶
License: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/ • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.2B | 2048 | 4.1K | 2.3 GB | 2025-10-16 | eng-Latn |
omarelshehy/arabic-english-sts-matryoshka¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 559.9M | 1024 | 514 | 2.1 GB | 2024-10-13 | ara-Arab, eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
openai/text-embedding-3-large¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 3072 | 8.2K | not specified | 2024-01-25 | not specified |
openai/text-embedding-3-large (embed_dim=512)¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 512 | 8.2K | not specified | 2024-01-25 | not specified |
openai/text-embedding-3-small¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1536 | 8.2K | not specified | 2024-01-25 | not specified |
openai/text-embedding-3-small (embed_dim=512)¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 512 | 8.2K | not specified | 2024-01-25 | not specified |
openai/text-embedding-ada-002¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1536 | 8.2K | not specified | 2022-12-15 | not specified |
openbmb/MiniCPM-Embedding¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 2.7B | 2304 | 512 | 5.1 GB | 2024-09-04 | eng-Latn, zho-Hans |
panalexeu/xlm-roberta-ua-distilled¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 512 | 1.0 GB | 2025-04-15 | eng-Latn, ukr-Cyrl |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
perplexity-ai/pplx-embed-v1-0.6b¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 596.0M | 1024 | 32.8K | 2.2 GB | 2026-02-11 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{eslami2026diffusionpretraineddensecontextualembeddings,
title={Diffusion-Pretrained Dense and Contextual Embeddings},
author={Sedigheh Eslami and Maksim Gaiduk and Markus Krimmel and Louis Milliken and Bo Wang and Denis Bykov},
year={2026},
eprint={2602.11151},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2602.11151},
}
perplexity-ai/pplx-embed-v1-4b¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 32.8K | 15.0 GB | 2026-02-11 | afr-Latn, ara-Arab, aze-Latn, bel-Cyrl, ben-Beng, ... (73) |
Citation
@misc{eslami2026diffusionpretraineddensecontextualembeddings,
title={Diffusion-Pretrained Dense and Contextual Embeddings},
author={Sedigheh Eslami and Maksim Gaiduk and Markus Krimmel and Louis Milliken and Bo Wang and Denis Bykov},
year={2026},
eprint={2602.11151},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2602.11151},
}
prdev/mini-gte¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 66.4M | 768 | 512 | 253.0 MB | 2025-01-28 | eng-Latn |
rasgaard/m2v-dfm-large¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.9M | 256 | Infinite | 87.0 MB | 2025-10-08 | dan-Latn |
Citation
@software{minishlab2024model2vec,
author = {Tulkens, Stephan and {van Dongen}, Thomas},
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
year = {2024},
url = {https://github.com/MinishLab/model2vec}
}
richinfoai/ritrieve_zh_v1¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 326.0M | 1792 | 512 | 1.2 GB | 2025-03-25 | zho-Hans |
sbintuitions/sarashina-embedding-v1-1b¶
License: https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b/blob/main/LICENSE • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.2B | 1792 | 8.2K | 4.6 GB | 2024-11-22 | jpn-Jpan |
sbunlp/fabert¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.4M | 768 | 512 | 475.0 MB | 2024-10-07 | fas-Arab |
Citation
@inproceedings{masumi-etal-2025-fabert,
title = "{F}a{BERT}: Pre-training {BERT} on {P}ersian Blogs",
author = "Masumi, Mostafa and
Majd, Seyed Soroush and
Shamsfard, Mehrnoush and
Beigy, Hamid",
editor = "Bak, JinYeong and
Goot, Rob van der and
Jang, Hyeju and
Buaphet, Weerayut and
Ramponi, Alan and
Xu, Wei and
Ritter, Alan",
booktitle = "Proceedings of the Tenth Workshop on Noisy and User-generated Text",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wnut-1.10/",
doi = "10.18653/v1/2025.wnut-1.10",
pages = "85--96",
ISBN = "979-8-89176-232-9",
}
sdadas/mmlw-e5-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 514 | 1.0 GB | 2023-11-17 | pol-Latn |
Citation
@article{dadas2024pirb,
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
year={2024},
eprint={2402.13350},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
sdadas/mmlw-e5-large¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 559.9M | 1024 | 514 | 2.1 GB | 2023-11-17 | pol-Latn |
Citation
@article{dadas2024pirb,
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
year={2024},
eprint={2402.13350},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
sdadas/mmlw-e5-small¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 117.7M | 384 | 512 | 449.0 MB | 2023-11-17 | pol-Latn |
Citation
@article{dadas2024pirb,
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
year={2024},
eprint={2402.13350},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
sdadas/mmlw-roberta-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 124.4M | 768 | 514 | 475.0 MB | 2023-11-17 | pol-Latn |
Citation
@article{dadas2024pirb,
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
year={2024},
eprint={2402.13350},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
sdadas/mmlw-roberta-large¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 435.0M | 1024 | 514 | 1.6 GB | 2023-11-17 | pol-Latn |
Citation
@article{dadas2024pirb,
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
year={2024},
eprint={2402.13350},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
sensenova/piccolo-base-zh¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 102.3M | 768 | 512 | not specified | 2023-09-04 | zho-Hans |
sensenova/piccolo-large-zh-v2¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 512 | not specified | 2024-04-22 | zho-Hans |
Citation
@misc{huang2024piccolo2,
title = {Piccolo2: General Text Embedding with Multi-task Hybrid Loss Training},
author = {Junqin Huang and Zhongjie Hu and Zihao Jing and Mengya Gao and Yichao Wu},
year = {2024},
eprint = {2405.06932},
archivePrefix = {arXiv},
primaryClass = {cs.CL},
}
sentence-transformers/LaBSE¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 470.9M | 768 | 512 | 1.8 GB | 2019-11-01 | ara-Arab, bul-Cyrl, cat-Latn, ces-Latn, dan-Latn, ... (53) |
Citation
@misc{feng2022languageagnosticbertsentenceembedding,
title={Language-agnostic BERT Sentence Embedding},
author={Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang},
year={2022},
eprint={2007.01852},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2007.01852},
}
sentence-transformers/all-MiniLM-L12-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 384 | 256 | 127.0 MB | 2021-08-30 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "http://arxiv.org/abs/1908.10084",
}
sentence-transformers/all-MiniLM-L6-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 384 | 256 | 87.0 MB | 2021-08-30 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "http://arxiv.org/abs/1908.10084",
}
sentence-transformers/all-mpnet-base-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 384 | 418.0 MB | 2021-08-30 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "http://arxiv.org/abs/1908.10084",
}
sentence-transformers/gtr-t5-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 110.2M | 768 | 512 | 209.0 MB | 2022-02-09 | eng-Latn |
Citation
@misc{ni2021largedualencodersgeneralizable,
title={Large Dual Encoders Are Generalizable Retrievers},
author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
year={2021},
eprint={2112.07899},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2112.07899},
}
sentence-transformers/gtr-t5-large¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.7M | 768 | 512 | 639.0 MB | 2022-02-09 | eng-Latn |
Citation
@misc{ni2021largedualencodersgeneralizable,
title={Large Dual Encoders Are Generalizable Retrievers},
author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
year={2021},
eprint={2112.07899},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2112.07899},
}
sentence-transformers/gtr-t5-xl¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.2B | 768 | 512 | 2.3 GB | 2022-02-09 | eng-Latn |
Citation
@misc{ni2021largedualencodersgeneralizable,
title={Large Dual Encoders Are Generalizable Retrievers},
author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
year={2021},
eprint={2112.07899},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2112.07899},
}
sentence-transformers/gtr-t5-xxl¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.9B | 768 | 512 | 9.1 GB | 2022-02-09 | eng-Latn |
Citation
@misc{ni2021largedualencodersgeneralizable,
title={Large Dual Encoders Are Generalizable Retrievers},
author={Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Gustavo Hernández Ábrego and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
year={2021},
eprint={2112.07899},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2112.07899},
}
sentence-transformers/multi-qa-MiniLM-L6-cos-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 22.7M | 384 | 512 | 87.0 MB | 2021-08-30 | eng-Latn |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "http://arxiv.org/abs/1908.10084",
}
sentence-transformers/multi-qa-mpnet-base-dot-v1¶
License: not specified • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 512 | 418.0 MB | 2021-08-23 | eng-Latn |
sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 117.7M | 384 | 512 | 449.0 MB | 2019-11-01 | ara-Arab, bul-Cyrl, cat-Latn, ces-Latn, dan-Latn, ... (53) |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "http://arxiv.org/abs/1908.10084",
}
sentence-transformers/paraphrase-multilingual-mpnet-base-v2¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 278.0M | 768 | 512 | 1.0 GB | 2019-11-01 | ara-Arab, bul-Cyrl, cat-Latn, ces-Latn, dan-Latn, ... (53) |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "http://arxiv.org/abs/1908.10084",
}
sentence-transformers/sentence-t5-base¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 222.9M | 768 | 512 | 209.0 MB | 2022-02-09 | eng-Latn |
Citation
@misc{ni2021sentencet5scalablesentenceencoders,
title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models},
author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
year={2021},
eprint={2108.08877},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2108.08877},
}
sentence-transformers/sentence-t5-large¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.7M | 768 | 512 | 639.0 MB | 2022-02-09 | eng-Latn |
Citation
@misc{ni2021sentencet5scalablesentenceencoders,
title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models},
author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
year={2021},
eprint={2108.08877},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2108.08877},
}
sentence-transformers/sentence-t5-xl¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.2B | 768 | 512 | 2.3 GB | 2024-03-27 | eng-Latn |
Citation
@misc{ni2021sentencet5scalablesentenceencoders,
title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models},
author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
year={2021},
eprint={2108.08877},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2108.08877},
}
sentence-transformers/sentence-t5-xxl¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.9B | 768 | 512 | 9.1 GB | 2024-03-27 | eng-Latn |
Citation
@misc{ni2021sentencet5scalablesentenceencoders,
title={Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models},
author={Jianmo Ni and Gustavo Hernández Ábrego and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
year={2021},
eprint={2108.08877},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2108.08877},
}
sentence-transformers/static-retrieval-mrl-en-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 31.3M | 1024 | Infinite | 119.0 MB | 2024-10-24 | eng-Latn |
sentence-transformers/static-similarity-mrl-multilingual-v1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 108.4M | 1024 | not specified | 413.0 MB | 2025-01-15 | ara-Arab, bul-Cyrl, cat-Latn, ces-Latn, dan-Latn, ... (49) |
Citation
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
sergeyzh/LaBSE-ru-turbo¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 129.0M | 768 | 512 | 490.0 MB | 2024-06-27 | rus-Cyrl |
sergeyzh/rubert-tiny-turbo¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 29.2M | 312 | 2.0K | 111.0 MB | 2024-06-21 | rus-Cyrl |
shibing624/text2vec-base-chinese¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 102.3M | 768 | 512 | 390.0 MB | 2022-01-23 | zho-Hans |
Citation
@software{text2vec,
author = {Xu Ming},
title = {text2vec: A Tool for Text to Vector},
year = {2022},
url = {https://github.com/shibing624/text2vec},
}
shibing624/text2vec-base-chinese-paraphrase¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 117.9M | 768 | 512 | 450.0 MB | 2023-06-19 | zho-Hans |
Citation
@software{text2vec,
author = {Xu Ming},
title = {text2vec: A Tool for Text to Vector},
year = {2022},
url = {https://github.com/shibing624/text2vec},
}
shibing624/text2vec-base-multilingual¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 117.7M | 384 | 256 | 449.0 MB | 2023-06-22 | deu-Latn, eng-Latn, fra-Latn, ita-Latn, nld-Latn, ... (10) |
Citation
@software{text2vec,
author = {Xu Ming},
title = {text2vec: A Tool for Text to Vector},
year = {2022},
url = {https://github.com/shibing624/text2vec},
}
silma-ai/silma-embeddding-matryoshka-v0.1¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 135.2M | 768 | 512 | 516.0 MB | 2024-10-12 | ara-Arab, eng-Latn |
Citation
@misc{silma2024embedding,
author = {Soliman, Abu Bakr and Ouda, Karim and {SILMA AI}},
title = {SILMA Embedding Matryoshka 0.1},
year = {2024},
publisher = {Hugging Face},
url = {https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1},
}
spartan8806/atles-champion-embedding¶
License: apache-2.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 512 | 420.0 MB | 2025-11-15 | eng-Latn |
Citation
@article{conner2025epistemic,
title={The Epistemic Barrier: How RLHF Makes AI Consciousness Empirically Undecidable},
author={{Conner (spartan8806)}},
journal={ATLES Research Papers},
year={2025},
note={Cross-model validation study (Phoenix, Grok, Gemini, Claude)}
}
stephantulkens/NIFE-gte-modernbert-base_as_router¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 225.8M | 768 | 8.2K | 861.0 MB | 2025-10-30 | eng-Latn |
Citation
@software{Tulkens2025pyNIFE,
author = {Tulkens, St{\'e}phan},
title = {pyNIFE: nearly inference free embeddings in python},
year = {2025},
publisher = {Zenodo},
doi = {10.5281/zenodo.17512919},
url = {https://github.com/stephantul/pynife},
license = {MIT},
}
stephantulkens/NIFE-mxbai-embed-large-v1_as_router¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 445.7M | 1024 | 512 | 1.7 GB | 2025-11-03 | eng-Latn |
Citation
@software{Tulkens2025pyNIFE,
author = {Tulkens, St{\'e}phan},
title = {pyNIFE: nearly inference free embeddings in python},
year = {2025},
publisher = {Zenodo},
doi = {10.5281/zenodo.17512919},
url = {https://github.com/stephantul/pynife},
license = {MIT},
}
thenlper/gte-base¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 109.5M | 768 | 512 | 209.0 MB | 2023-07-27 | eng-Latn |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
thenlper/gte-base-zh¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 102.3M | 1024 | 512 | 195.0 MB | 2023-11-08 | zho-Hans |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
thenlper/gte-large¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.1M | 1024 | 512 | 639.0 MB | 2023-07-27 | eng-Latn |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
thenlper/gte-large-zh¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 325.5M | 1024 | 512 | 621.0 MB | 2023-11-08 | zho-Hans |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
thenlper/gte-small¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 33.4M | 384 | 512 | 64.0 MB | 2023-07-27 | eng-Latn |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
thenlper/gte-small-zh¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 30.3M | 1024 | 512 | 58.0 MB | 2023-11-08 | zho-Hans |
Citation
@article{li2023towards,
title={Towards general text embeddings with multi-stage contrastive learning},
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
journal={arXiv preprint arXiv:2308.03281},
year={2023}
}
unicamp-dl/mt5-base-mmarco-v2¶
License: not specified
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| not specified | not specified | not specified | not specified | 2022-01-05 | afr-Latn, amh-Ethi, ara-Arab, aze-Latn, bel-Cyrl, ... (103) |
Citation
@article{DBLP:journals/corr/abs-2108-13897,
author = {Luiz Bonifacio and
Israel Campiotti and
Roberto de Alencar Lotufo and
Rodrigo Frassetto Nogueira},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/journals/corr/abs-2108-13897.bib},
eprint = {2108.13897},
eprinttype = {arXiv},
journal = {CoRR},
timestamp = {Mon, 20 Mar 2023 15:35:34 +0100},
title = {mMARCO: {A} Multilingual Version of {MS} {MARCO} Passage Ranking Dataset},
url = {https://arxiv.org/abs/2108.13897},
volume = {abs/2108.13897},
year = {2021},
}
w601sxs/b1ade-embed¶
License: mit • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 335.1M | 1024 | 4.1K | 1.2 GB | 2025-03-10 | eng-Latn |
Citation
@misc{subramanian2024b1ade,
author = {Subramanian, Shreyas},
title = {{b1ade} series of models},
year = {2024},
url = {https://huggingface.co/w601sxs/b1ade-embed},
publisher = {Hugging Face}
}
zeroentropy/zembed-1¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 32.8K | 7.5 GB | 2026-03-02 | ara-Arab, deu-Latn, eng-Latn, fin-Latn, fra-Latn, ... (15) |
Citation
@misc{pipitone2025zeloeloinspiredtrainingmethod,
title={zELO: ELO-inspired Training Method for Rerankers and Embedding Models},
author={Nicholas Pipitone and Ghita Houir Alami and Advaith Avadhanam and Anton Kaminskyi and Ashley Khoo},
year={2025},
eprint={2509.12541},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2509.12541},
}
zeroentropy/zerank-1¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 41.0K | 7.5 GB | 2025-07-01 | eng-Latn |
Citation
@misc{pipitone2025zeloeloinspiredtrainingmethod,
title={zELO: ELO-inspired Training Method for Rerankers and Embedding Models},
author={Nicholas Pipitone and Ghita Houir Alami and Advaith Avadhanam and Anton Kaminskyi and Ashley Khoo},
year={2025},
eprint={2509.12541},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2509.12541},
}
zeroentropy/zerank-1-small¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 1.7B | 2048 | 41.0K | 3.2 GB | 2025-07-01 | eng-Latn |
Citation
@misc{pipitone2025zeloeloinspiredtrainingmethod,
title={zELO: ELO-inspired Training Method for Rerankers and Embedding Models},
author={Nicholas Pipitone and Ghita Houir Alami and Advaith Avadhanam and Anton Kaminskyi and Ashley Khoo},
year={2025},
eprint={2509.12541},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2509.12541},
}
zeroentropy/zerank-2¶
License: cc-by-nc-4.0 • Learn more →
| Parameters | Emb. Dim | Max Tokens | Memory | Released | Languages |
|---|---|---|---|---|---|
| 4.0B | 2560 | 41.0K | 7.5 GB | 2025-11-19 | eng-Latn |
Citation
@misc{pipitone2025zeloeloinspiredtrainingmethod,
title={zELO: ELO-inspired Training Method for Rerankers and Embedding Models},
author={Nicholas Pipitone and Ghita Houir Alami and Advaith Avadhanam and Anton Kaminskyi and Ashley Khoo},
year={2025},
eprint={2509.12541},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2509.12541},
}