Skip to content

Audio-text Model

10 Models

Instruction Model

LCO-Embedding/LCO-Embedding-Omni-3B

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
4.7B 2048 32.8K 8.8 GB 2025-10-23 eng-Latn
Citation
@misc{xiao2025scalinglanguagecentricomnimodalrepresentation,
  author        = {Xiao, Chenghao and Chan, Hou Pong and Zhang, Hao and Xu, Weiwen and Aljunied, Mahani and Rong, Yu},
  title         = {Scaling Language-Centric Omnimodal Representation Learning},
  year          = {2025},
  eprint        = {2510.11693},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2510.11693},
}

LCO-Embedding/LCO-Embedding-Omni-7B

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
8.9B 3584 32.8K 16.6 GB 2025-10-15 eng-Latn
Citation
@misc{xiao2025scalinglanguagecentricomnimodalrepresentation,
  author        = {Xiao, Chenghao and Chan, Hou Pong and Zhang, Hao and Xu, Weiwen and Aljunied, Mahani and Rong, Yu},
  title         = {Scaling Language-Centric Omnimodal Representation Learning},
  year          = {2025},
  eprint        = {2510.11693},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2510.11693},
}

Qwen/Qwen2-Audio-7B

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
7.0B 1280 131.1K not specified 2024-08-09 eng-Latn
Citation
@misc{chu2024qwen2audiotechnicalreport,
  author        = {Chu, Yunfei and Xu, Jin and Yang, Qian and Wei, Haojie and Wei, Xipin and Guo, Zhifang and Leng, Yichong and Lv, Yuanjun and He, Jinzheng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
  title         = {{Qwen2-Audio} Technical Report},
  year          = {2024},
  eprint        = {2407.10759},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2407.10759},
}

Non-instruction Model

OpenMuQ/MuQ-MuLan-large

License: cc-by-nc-4.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
630.0M 512 not specified 2.5 GB 2025-01-01 eng-Latn, zho-Hans
Citation
@misc{zhu2025muqselfsupervisedmusicrepresentation,
  author        = {Zhu, Haina and Zhou, Yizhi and Chen, Hangting and Yu, Jianwei and Ma, Ziyang and Gu, Rongzhi and Luo, Yi and Tan, Wei and Chen, Xie},
  title         = {{MuQ}: Self-Supervised Music Representation Learning with Mel Residual Vector Quantization},
  year          = {2025},
  eprint        = {2501.01108},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2501.01108},
}

laion/clap-htsat-fused

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
153.5M 512 Infinite 586.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

laion/clap-htsat-unfused

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
153.5M 512 Infinite 586.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

laion/larger_clap_general

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
193.9M 512 Infinite 740.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

laion/larger_clap_music

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
193.9M 512 Infinite 740.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

laion/larger_clap_music_and_speech

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
193.9M 512 Infinite 740.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

lyrebird/wav2clip

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
163.0M 512 not specified 622.0 MB 2022-03-15 eng-Latn
Citation
@misc{wu2022wav2cliplearningrobustaudio,
  author        = {Wu, Ho-Hsiang and Seetharaman, Prem and Kumar, Kundan and Bello, Juan Pablo},
  title         = {{Wav2CLIP}: Learning Robust Audio Representations From {CLIP}},
  year          = {2022},
  eprint        = {2110.11499},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2110.11499},
}

microsoft/msclap-2022

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
196.0M 1024 not specified 750.0 MB 2022-12-01 eng-Latn
Citation
@inproceedings{CLAP2022,
  author       = {Elizalde, Benjamin and Deshmukh, Soham and Al Ismail, Mahmoud and Wang, Huaming},
  title        = {{CLAP}: Learning Audio Concepts from Natural Language Supervision},
  booktitle    = {{ICASSP} 2023 - 2023 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages        = {1--5},
  year         = {2023},
  organization = {IEEE},
}

microsoft/msclap-2023

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
160.0M 1024 not specified 610.0 MB 2023-09-01 eng-Latn
Citation
@misc{CLAP2023,
  author        = {Elizalde, Benjamin and Deshmukh, Soham and Wang, Huaming},
  title         = {Natural Language Supervision for General-Purpose Audio Representations},
  year          = {2023},
  eprint        = {2309.05767},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2309.05767},
}

microsoft/speecht5_multimodal

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
297.9M 768 not specified 1.1 GB 2022-05-16 eng-Latn
Citation
@misc{ao2022speecht5unifiedmodalencoderdecoderpretraining,
  author        = {Ao, Junyi and Wang, Rui and Zhou, Long and Wang, Chengyi and Ren, Shuo and Wu, Yu and Liu, Shujie and Ko, Tom and Li, Qing and Zhang, Yu and Wei, Zhihua and Qian, Yao and Li, Jinyu and Wei, Furu},
  title         = {{SpeechT5}: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
  year          = {2022},
  eprint        = {2110.07205},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2110.07205},
}