Skip to content

Audio-text Model

  • Number of models: 10

Instruction Model

LCO-Embedding/LCO-Embedding-Omni-3B

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 2048 4.7B 8.8 GB 2025-10-23 eng-Latn
Citation
@misc{xiao2025scalinglanguagecentricomnimodalrepresentation,
  author        = {Xiao, Chenghao and Chan, Hou Pong and Zhang, Hao and Xu, Weiwen and Aljunied, Mahani and Rong, Yu},
  title         = {Scaling Language-Centric Omnimodal Representation Learning},
  year          = {2025},
  eprint        = {2510.11693},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2510.11693},
}

LCO-Embedding/LCO-Embedding-Omni-7B

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
32.8K 3584 8.9B 16.6 GB 2025-10-15 eng-Latn
Citation
@misc{xiao2025scalinglanguagecentricomnimodalrepresentation,
  author        = {Xiao, Chenghao and Chan, Hou Pong and Zhang, Hao and Xu, Weiwen and Aljunied, Mahani and Rong, Yu},
  title         = {Scaling Language-Centric Omnimodal Representation Learning},
  year          = {2025},
  eprint        = {2510.11693},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2510.11693},
}

Qwen/Qwen2-Audio-7B

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
131.1K 1280 7.0B not specified 2024-08-09 eng-Latn
Citation
@misc{chu2024qwen2audiotechnicalreport,
  author        = {Chu, Yunfei and Xu, Jin and Yang, Qian and Wei, Haojie and Wei, Xipin and Guo, Zhifang and Leng, Yichong and Lv, Yuanjun and He, Jinzheng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
  title         = {{Qwen2-Audio} Technical Report},
  year          = {2024},
  eprint        = {2407.10759},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2407.10759},
}

Non-instruction Model

OpenMuQ/MuQ-MuLan-large

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 512 630.0M 2.5 GB 2025-01-01 eng-Latn, zho-Hans
Citation
@misc{zhu2025muqselfsupervisedmusicrepresentation,
  author        = {Zhu, Haina and Zhou, Yizhi and Chen, Hangting and Yu, Jianwei and Ma, Ziyang and Gu, Rongzhi and Luo, Yi and Tan, Wei and Chen, Xie},
  title         = {{MuQ}: Self-Supervised Music Representation Learning with Mel Residual Vector Quantization},
  year          = {2025},
  eprint        = {2501.01108},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2501.01108},
}

laion/clap-htsat-fused

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
inf 512 153.5M 586.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

laion/clap-htsat-unfused

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
inf 512 153.5M 586.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

laion/larger_clap_general

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
inf 512 193.9M 740.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

laion/larger_clap_music

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
inf 512 193.9M 740.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

laion/larger_clap_music_and_speech

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
inf 512 193.9M 740.0 MB 2023-05-22 eng-Latn
Citation
@misc{wu2024largescalecontrastivelanguageaudiopretraining,
  author        = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Nezhurina, Marianna and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  title         = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  year          = {2024},
  eprint        = {2211.06687},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2211.06687},
}

lyrebird/wav2clip

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 512 163.0M 622.0 MB 2022-03-15 eng-Latn
Citation
@misc{wu2022wav2cliplearningrobustaudio,
  author        = {Wu, Ho-Hsiang and Seetharaman, Prem and Kumar, Kundan and Bello, Juan Pablo},
  title         = {{Wav2CLIP}: Learning Robust Audio Representations From {CLIP}},
  year          = {2022},
  eprint        = {2110.11499},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2110.11499},
}

microsoft/msclap-2022

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1024 196.0M 750.0 MB 2022-12-01 eng-Latn
Citation
@inproceedings{CLAP2022,
  author       = {Elizalde, Benjamin and Deshmukh, Soham and Al Ismail, Mahmoud and Wang, Huaming},
  title        = {{CLAP} Learning Audio Concepts from Natural Language Supervision},
  booktitle    = {{ICASSP} 2023 - 2023 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages        = {1--5},
  year         = {2023},
  organization = {IEEE},
}

microsoft/msclap-2023

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1024 160.0M 610.0 MB 2023-09-01 eng-Latn
Citation
@misc{CLAP2023,
  author        = {Elizalde, Benjamin and Deshmukh, Soham and Wang, Huaming},
  title         = {Natural Language Supervision for General-Purpose Audio Representations},
  year          = {2023},
  eprint        = {2309.05767},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2309.05767},
}

microsoft/speecht5_multimodal

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 768 297.9M 1.1 GB 2022-05-16 eng-Latn
Citation
@misc{ao2022speecht5unifiedmodalencoderdecoderpretraining,
  author        = {Ao, Junyi and Wang, Rui and Zhou, Long and Wang, Chengyi and Ren, Shuo and Wu, Yu and Liu, Shujie and Ko, Tom and Li, Qing and Zhang, Yu and Wei, Zhihua and Qian, Yao and Li, Jinyu and Wei, Furu},
  title         = {{SpeechT5}: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
  year          = {2022},
  eprint        = {2110.07205},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2110.07205},
}