Skip to content

Audio Model

  • Number of models: 41

Non-instruction Model

MIT/ast-finetuned-audioset-10-10-0.4593

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 768 86.6M 330.0 MB 2021-07-08 eng-Latn
Citation
@misc{gong2021astaudiospectrogramtransformer,
      title={{AST}: Audio Spectrogram Transformer},
      author={Yuan Gong and Yu-An Chung and James Glass},
      year={2021},
      eprint={2104.01778},
      archivePrefix={arXiv},
      primaryClass={cs.SD},
      url={https://arxiv.org/abs/2104.01778},
}

asapp/sew-d-base-plus-400k-ft-ls100h

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 95.0M 675.0 MB 2021-09-14 eng-Latn
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
  author        = {Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
  title         = {Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
  year          = {2021},
  eprint        = {2109.06870},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2109.06870},
}

asapp/sew-d-mid-400k-ft-ls100h

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 139.0M 530.0 MB 2021-09-14 eng-Latn
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
  author        = {Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
  title         = {Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
  year          = {2021},
  eprint        = {2109.06870},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2109.06870},
}

asapp/sew-d-tiny-100k-ft-ls100h

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 256 19.7M 92.0 MB 2021-09-14 eng-Latn
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
  author        = {Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
  title         = {Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
  year          = {2021},
  eprint        = {2109.06870},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2109.06870},
}

facebook/data2vec-audio-base-960h

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 768 93.2M 355.0 MB 2022-02-07 eng-Latn
Citation
@misc{baevski2022data2vecgeneralframeworkselfsupervised,
  author        = {Alexei Baevski and Wei-Ning Hsu and Qiantong Xu and Arun Babu and Jiatao Gu and Michael Auli},
  title         = {data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language},
  year          = {2022},
  eprint        = {2202.03555},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2202.03555},
}

facebook/data2vec-audio-large-960h

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1024 313.3M 1.2 GB 2022-02-07 eng-Latn
Citation
@misc{baevski2022data2vecgeneralframeworkselfsupervised,
  author        = {Alexei Baevski and Wei-Ning Hsu and Qiantong Xu and Arun Babu and Jiatao Gu and Michael Auli},
  title         = {data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language},
  year          = {2022},
  eprint        = {2202.03555},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2202.03555},
}

facebook/encodec_24khz

License: cc-by-nc-4.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 128 23.3M 88.0 MB 2022-10-25 eng-Latn
Citation
@misc{défossez2022highfidelityneuralaudio,
  author        = {Alexandre Défossez and Jade Copet and Gabriel Synnaeve and Yossi Adi},
  title         = {High Fidelity Neural Audio Compression},
  year          = {2022},
  eprint        = {2210.13438},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2210.13438},
}

facebook/hubert-base-ls960

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 95.0M 360.0 MB 2021-06-14 eng-Latn
Citation
@misc{hsu2021hubertselfsupervisedspeechrepresentation,
    title={{HuBERT}: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
    author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
    year={2021},
    eprint={2106.07447},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2106.07447},
}

facebook/hubert-large-ls960-ft

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 317.0M 1.2 GB 2021-06-14 eng-Latn
Citation
@misc{hsu2021hubertselfsupervisedspeechrepresentation,
    title={{HuBERT}: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
    author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
    year={2021},
    eprint={2106.07447},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2106.07447},
}

facebook/mms-1b-all

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1024 1.0B 3.6 GB 2023-05-22 ara-Arab, cmn-Hans, deu-Latn, eng-Latn, fra-Latn, ... (7)
Citation
@misc{pratap2023scalingspeechtechnology1000,
    author        = {Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
    title         = {Scaling Speech Technology to 1,000+ Languages},
    year          = {2023},
    eprint        = {2305.13516},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2305.13516},
}

facebook/mms-1b-fl102

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1024 1.0B 3.6 GB 2023-05-22 eng-Latn
Citation
@misc{pratap2023scalingspeechtechnology1000,
    author        = {Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
    title         = {Scaling Speech Technology to 1,000+ Languages},
    year          = {2023},
    eprint        = {2305.13516},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2305.13516},
}

facebook/mms-1b-l1107

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1024 1.0B 3.6 GB 2023-05-22 eng-Latn
Citation
@misc{pratap2023scalingspeechtechnology1000,
    author        = {Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
    title         = {Scaling Speech Technology to 1,000+ Languages},
    year          = {2023},
    eprint        = {2305.13516},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2305.13516},
}

facebook/seamless-m4t-v2-large

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1024 2.3B 8.6 GB 2023-11-06 eng-Latn
Citation
@misc{communication2023seamlessmultilingualexpressivestreaming,
      title={Seamless: Multilingual Expressive and Streaming Speech Translation},
      author={{Seamless Communication} and Loïc Barrault and Yu-An Chung and Mariano Coria Meglioli and David Dale and Ning Dong and Mark Duppenthaler and Paul-Ambroise Duquenne and Brian Ellis and Hady Elsahar and Justin Haaheim and John Hoffman and Min-Jae Hwang and Hirofumi Inaguma and Christopher Klaiber and Ilia Kulikov and Pengwei Li and Daniel Licht and Jean Maillard and Ruslan Mavlyutov and Alice Rakotoarison and Kaushik Ram Sadagopan and Abinesh Ramakrishnan and Tuan Tran and Guillaume Wenzek and Yilin Yang and Ethan Ye and Ivan Evtimov and Pierre Fernandez and Cynthia Gao and Prangthip Hansanti and Elahe Kalbassi and Amanda Kallet and Artyom Kozhevnikov and Gabriel Mejia Gonzalez and Robin San Roman and Christophe Touret and Corinne Wong and Carleigh Wood and Bokai Yu and Pierre Andrews and Can Balioglu and Peng-Jen Chen and Marta R. Costa-jussà and Maha Elbayad and Hongyu Gong and Francisco Guzmán and Kevin Heffernan and Somya Jain and Justine Kao and Ann Lee and Xutai Ma and Alex Mourachko and Benjamin Peloquin and Juan Pino and Sravya Popuri and Christophe Ropers and Safiyyah Saleem and Holger Schwenk and Anna Sun and Paden Tomasello and Changhan Wang and Jeff Wang and Skyler Wang and Mary Williamson},
      year={2023},
      eprint={2312.05187},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2312.05187},
}

facebook/wav2vec2-base

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 95.0M 362.0 MB 2020-10-26 eng-Latn
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
  author        = {Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
  title         = {wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
  year          = {2020},
  eprint        = {2006.11477},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2006.11477},
}

facebook/wav2vec2-base-960h

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 95.0M 360.0 MB 2020-10-26 eng-Latn
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
  author        = {Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
  title         = {wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
  year          = {2020},
  eprint        = {2006.11477},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2006.11477},
}

facebook/wav2vec2-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 317.0M 1.2 GB 2020-10-26 eng-Latn
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
  author        = {Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
  title         = {wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
  year          = {2020},
  eprint        = {2006.11477},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2006.11477},
}

facebook/wav2vec2-large-xlsr-53

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 317.0M 1.2 GB 2020-10-26 eng-Latn
Citation
@misc{conneau2020unsupervisedcrosslingualrepresentationlearning,
  author        = {Alexis Conneau and Alexei Baevski and Ronan Collobert and Abdelrahman Mohamed and Michael Auli},
  title         = {Unsupervised Cross-lingual Representation Learning for Speech Recognition},
  year          = {2020},
  eprint        = {2006.13979},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2006.13979},
}

facebook/wav2vec2-lv-60-espeak-cv-ft

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 317.0M 1.2 GB 2020-10-26 eng-Latn
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
  author        = {Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
  title         = {wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
  year          = {2020},
  eprint        = {2006.11477},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2006.11477},
}

facebook/wav2vec2-xls-r-1b

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 1.0B 4.4 GB 2024-09-10 abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

facebook/wav2vec2-xls-r-2b

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 2.0B 8.8 GB 2024-09-10 abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

facebook/wav2vec2-xls-r-2b-21-to-en

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 2.0B 9.0 GB 2024-09-10 abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

facebook/wav2vec2-xls-r-300m

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 300.0M 1.2 GB 2021-10-13 abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

google/vggish

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 128 72.1M 275.0 MB 2019-06-13 eng-Latn
Citation
@inproceedings{hershey2017cnn,
    author = {Hershey, Shawn and Chaudhuri, Sourish and Ellis, Daniel P. W. and Gemmeke, Jort F. and Jansen, Aren and Moore, R. Channing and Plakal, Manoj and Platt, Devin and Saurous, Rif A. and Seybold, Bryan and Slaney, Malcolm and Weiss, Ron J. and Wilson, Kevin},
    title = {{CNN} architectures for large-scale audio classification},
    year = {2017},
    publisher = {IEEE Press},
    url = {https://doi.org/10.1109/ICASSP.2017.7952132},
    doi = {10.1109/ICASSP.2017.7952132},
    booktitle = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
    pages = {131--135},
    numpages = {5},
    location = {New Orleans, LA, USA}
}

google/yamnet

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 3.8M 14.0 MB 2020-10-06 eng-Latn
Citation
@inproceedings{audioset,
  title={Audio set: An ontology and human-labeled dataset for audio events},
  author={Gemmeke, Jort F. and Ellis, Daniel P. W. and Freedman, Dylan and Jansen, Aren and Lawrence, Wade and Moore, R. Channing and Plakal, Manoj and Ritter, Marvin},
  booktitle={2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages={776--780},
  year={2017},
  organization={IEEE}
}

microsoft/speecht5_asr

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 768 151.6M 578.0 MB 2022-05-16 eng-Latn
Citation
@misc{ao2022speecht5unifiedmodalencoderdecoderpretraining,
      title={{SpeechT5}: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
      author={Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
      year={2022},
      eprint={2110.07205},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2110.07205},
}

microsoft/unispeech-sat-base-100h-libri-ft

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 94.0M 359.0 MB 2021-10-12 eng-Latn
Citation
@misc{chen2021unispeechsatuniversalspeechrepresentation,
      title={{UniSpeech-SAT}: Universal Speech Representation Learning with Speaker Aware Pre-Training},
      author={Sanyuan Chen and Yu Wu and Chengyi Wang and Zhengyang Chen and Zhuo Chen and Shujie Liu and Jian Wu and Yao Qian and Furu Wei and Jinyu Li and Xiangzhan Yu},
      year={2021},
      eprint={2110.05752},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2110.05752},
}

microsoft/wavlm-base

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 94.7M 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518},
}

microsoft/wavlm-base-plus

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 94.7M 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518},
}

microsoft/wavlm-base-plus-sd

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 94.7M 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518},
}

microsoft/wavlm-base-plus-sv

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 94.7M 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518},
}

microsoft/wavlm-base-sd

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 94.7M 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518},
}

microsoft/wavlm-base-sv

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 94.7M 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518},
}

microsoft/wavlm-large

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 316.6M 1.2 GB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518},
}

openai/whisper-base

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 512 74.0M 277.0 MB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356},
}

openai/whisper-large-v3

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1280 1.6B 5.7 GB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356},
}

openai/whisper-medium

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 769.0M 2.8 GB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356},
}

openai/whisper-small

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 768 244.0M 922.0 MB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356},
}

openai/whisper-tiny

License: mit

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 512 39.0M 144.0 MB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356},
}

speechbrain/cnn14-esc50

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 2048 80.8M 308.0 MB 2022-11-26 eng-Latn
Citation
@inproceedings{wang2022CRL,
    title={Learning Representations for New Sound Classes With Continual Self-Supervised Learning},
    author={Zhepei Wang and Cem Subakan and Xilin Jiang and Junkai Wu and Efthymios Tzinis and Mirco Ravanelli and Paris Smaragdis},
    year={2022},
    booktitle={Accepted to IEEE Signal Processing Letters}
}

speechbrain/m-ctc-t-large

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
not specified 1536 1.1B 3.9 GB 2022-01-10 abk-Cyrl, ara-Arab, asm-Beng, bre-Latn, cat-Latn, ... (58)
Citation
@misc{lugosch2022pseudolabelingmassivelymultilingualspeech,
  author        = {Loren Lugosch and Tatiana Likhomanenko and Gabriel Synnaeve and Ronan Collobert},
  title         = {Pseudo-Labeling for Massively Multilingual Speech Recognition},
  year          = {2022},
  eprint        = {2111.00161},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2111.00161},
}

vitouphy/wav2vec2-xls-r-300m-phoneme

License: apache-2.0

Max Tokens Embedding dimension Parameters Required Memory (Mb) Release date Languages
infP 1024 300.0M 1.2 GB 2022-05-19 eng-Latn
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}