Skip to content

Audio Model

41 Models

Non-instruction Model

MIT/ast-finetuned-audioset-10-10-0.4593

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
86.6M 768 not specified 330.0 MB 2021-07-08 eng-Latn
Citation
@misc{gong2021astaudiospectrogramtransformer,
      title={AST: Audio Spectrogram Transformer},
      author={Yuan Gong and Yu-An Chung and James Glass},
      year={2021},
      eprint={2104.01778},
      archivePrefix={arXiv},
      primaryClass={cs.SD},
      url={https://arxiv.org/abs/2104.01778},
}

asapp/sew-d-base-plus-400k-ft-ls100h

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
95.0M 768 Infinite 675.0 MB 2021-09-14 eng-Latn
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
      title={Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
      author={Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
      year={2021},
      eprint={2109.06870},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2109.06870},
}

asapp/sew-d-mid-400k-ft-ls100h

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
139.0M 768 Infinite 530.0 MB 2021-09-14 eng-Latn
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
      title={Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
      author={Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
      year={2021},
      eprint={2109.06870},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2109.06870},
}

asapp/sew-d-tiny-100k-ft-ls100h

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
19.7M 256 Infinite 92.0 MB 2021-09-14 eng-Latn
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
      title={Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
      author={Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
      year={2021},
      eprint={2109.06870},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2109.06870},
}

facebook/data2vec-audio-base-960h

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
93.2M 768 not specified 355.0 MB 2022-02-07 eng-Latn
Citation
@misc{baevski2022data2vecgeneralframeworkselfsupervised,
    title={data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language},
    author={Alexei Baevski and Wei-Ning Hsu and Qiantong Xu and Arun Babu and Jiatao Gu and Michael Auli},
    year={2022},
    eprint={2202.03555},
    archivePrefix={arXiv},
    primaryClass={cs.LG},
    url={https://arxiv.org/abs/2202.03555},
}

facebook/data2vec-audio-large-960h

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
313.3M 1024 not specified 1.2 GB 2022-02-07 eng-Latn
Citation
@misc{baevski2022data2vecgeneralframeworkselfsupervised,
    title={data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language},
    author={Alexei Baevski and Wei-Ning Hsu and Qiantong Xu and Arun Babu and Jiatao Gu and Michael Auli},
    year={2022},
    eprint={2202.03555},
    archivePrefix={arXiv},
    primaryClass={cs.LG},
    url={https://arxiv.org/abs/2202.03555},
}

facebook/encodec_24khz

License: cc-by-nc-4.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
23.3M 128 not specified 88.0 MB 2022-10-25 eng-Latn
Citation
@misc{defossez2022highfidelityneuralaudio,
      title={High Fidelity Neural Audio Compression},
      author={Alexandre Défossez and Jade Copet and Gabriel Synnaeve and Yossi Adi},
      year={2022},
      eprint={2210.13438},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2210.13438},
}

facebook/hubert-base-ls960

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
95.0M 768 Infinite 360.0 MB 2021-06-14 eng-Latn
Citation
@misc{hsu2021hubertselfsupervisedspeechrepresentation,
    title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
    author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
    year={2021},
    eprint={2106.07447},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2106.07447},
}

facebook/hubert-large-ls960-ft

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
317.0M 1024 Infinite 1.2 GB 2021-06-14 eng-Latn
Citation
@misc{hsu2021hubertselfsupervisedspeechrepresentation,
    title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
    author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
    year={2021},
    eprint={2106.07447},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2106.07447},
}

facebook/mms-1b-all

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.0B 1024 not specified 3.6 GB 2023-05-22 ara-Arab, cmn-Hans, deu-Latn, eng-Latn, fra-Latn, ... (7)
Citation
@misc{pratap2023scalingspeechtechnology1000,
  title={Scaling Speech Technology to 1,000+ Languages},
  author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
  year={2023},
  eprint={2305.13516},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2305.13516},
}

facebook/mms-1b-fl102

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.0B 1024 not specified 3.6 GB 2023-05-22 eng-Latn
Citation
@misc{pratap2023scalingspeechtechnology1000,
  title={Scaling Speech Technology to 1,000+ Languages},
  author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
  year={2023},
  eprint={2305.13516},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2305.13516},
}

facebook/mms-1b-l1107

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.0B 1024 not specified 3.6 GB 2023-05-22 eng-Latn
Citation
@misc{pratap2023scalingspeechtechnology1000,
  title={Scaling Speech Technology to 1,000+ Languages},
  author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
  year={2023},
  eprint={2305.13516},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2305.13516},
}

facebook/seamless-m4t-v2-large

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.3B 1024 not specified 8.6 GB 2023-11-06 eng-Latn
Citation
@misc{communication2023seamlessmultilingualexpressivestreaming,
      title={Seamless: Multilingual Expressive and Streaming Speech Translation},
      author={Seamless Communication and Loïc Barrault and Yu-An Chung and Mariano Coria Meglioli and David Dale and Ning Dong and Mark Duppenthaler and Paul-Ambroise Duquenne and Brian Ellis and Hady Elsahar and Justin Haaheim and John Hoffman and Min-Jae Hwang and Hirofumi Inaguma and Christopher Klaiber and Ilia Kulikov and Pengwei Li and Daniel Licht and Jean Maillard and Ruslan Mavlyutov and Alice Rakotoarison and Kaushik Ram Sadagopan and Abinesh Ramakrishnan and Tuan Tran and Guillaume Wenzek and Yilin Yang and Ethan Ye and Ivan Evtimov and Pierre Fernandez and Cynthia Gao and Prangthip Hansanti and Elahe Kalbassi and Amanda Kallet and Artyom Kozhevnikov and Gabriel Mejia Gonzalez and Robin San Roman and Christophe Touret and Corinne Wong and Carleigh Wood and Bokai Yu and Pierre Andrews and Can Balioglu and Peng-Jen Chen and Marta R. Costa-jussà and Maha Elbayad and Hongyu Gong and Francisco Guzmán and Kevin Heffernan and Somya Jain and Justine Kao and Ann Lee and Xutai Ma and Alex Mourachko and Benjamin Peloquin and Juan Pino and Sravya Popuri and Christophe Ropers and Safiyyah Saleem and Holger Schwenk and Anna Sun and Paden Tomasello and Changhan Wang and Jeff Wang and Skyler Wang and Mary Williamson},
      year={2023},
      eprint={2312.05187},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2312.05187},
}

facebook/wav2vec2-base

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
95.0M 768 Infinite 362.0 MB 2020-10-26 eng-Latn
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
      title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
      author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.11477},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.11477},
}

facebook/wav2vec2-base-960h

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
95.0M 768 Infinite 360.0 MB 2020-10-26 eng-Latn
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
      title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
      author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.11477},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.11477},
}

facebook/wav2vec2-large

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
317.0M 1024 Infinite 1.2 GB 2020-10-26 eng-Latn
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
      title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
      author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.11477},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.11477},
}

facebook/wav2vec2-large-xlsr-53

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
317.0M 1024 Infinite 1.2 GB 2020-10-26 eng-Latn
Citation
@misc{conneau2020unsupervisedcrosslingualrepresentationlearning,
      title={Unsupervised Cross-lingual Representation Learning for Speech Recognition},
      author={Alexis Conneau and Alexei Baevski and Ronan Collobert and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.13979},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.13979},
}

facebook/wav2vec2-lv-60-espeak-cv-ft

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
317.0M 1024 Infinite 1.2 GB 2020-10-26 eng-Latn
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
      title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
      author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.11477},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.11477},
}

facebook/wav2vec2-xls-r-1b

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.0B 1024 Infinite 4.4 GB 2024-09-10 abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

facebook/wav2vec2-xls-r-2b

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.0B 1024 Infinite 8.8 GB 2024-09-10 abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

facebook/wav2vec2-xls-r-2b-21-to-en

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
2.0B 1024 Infinite 9.0 GB 2024-09-10 abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

facebook/wav2vec2-xls-r-300m

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
300.0M 1024 Infinite 1.2 GB 2021-10-13 abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

google/vggish

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
72.1M 128 Infinite 275.0 MB 2019-06-13 eng-Latn
Citation
@inproceedings{hershey2017cnn,
    author = {Hershey, Shawn and Chaudhuri, Sourish and Ellis, Daniel P. W. and Gemmeke, Jort F. and Jansen, Aren and Moore, R. Channing and Plakal, Manoj and Platt, Devin and Saurous, Rif A. and Seybold, Bryan and Slaney, Malcolm and Weiss, Ron J. and Wilson, Kevin},
    title = {CNN architectures for large-scale audio classification},
    year = {2017},
    publisher = {IEEE Press},
    url = {https://doi.org/10.1109/ICASSP.2017.7952132},
    doi = {10.1109/ICASSP.2017.7952132},
    booktitle = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
    pages = {131--135},
    numpages = {5},
    location = {New Orleans, LA, USA}
}

google/yamnet

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
3.8M 1024 Infinite 14.0 MB 2020-10-06 eng-Latn
Citation
@inproceedings{audioset,
  title={Audio set: An ontology and human-labeled dataset for audio events},
  author={Gemmeke, Jort F and Ellis, Daniel PW and Freedman, Dylan and Jansen, Aren and Lawrence, Wade and Moore, R Channing and Plakal, Manoj and Ritter, Marvin},
  booktitle={2017 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
  pages={776--780},
  year={2017},
  organization={IEEE}
}

microsoft/speecht5_asr

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
151.6M 768 not specified 578.0 MB 2022-05-16 eng-Latn
Citation
@misc{ao2022speecht5unifiedmodalencoderdecoderpretraining,
      title={SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
      author={Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
      year={2022},
      eprint={2110.07205},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2110.07205},
}

microsoft/unispeech-sat-base-100h-libri-ft

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
94.0M 768 Infinite 359.0 MB 2021-10-12 eng-Latn
Citation
@misc{chen2021unispeechsatuniversalspeechrepresentation,
      title={UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware Pre-Training},
      author={Sanyuan Chen and Yu Wu and Chengyi Wang and Zhengyang Chen and Zhuo Chen and Shujie Liu and Jian Wu and Yao Qian and Furu Wei and Jinyu Li and Xiangzhan Yu},
      year={2021},
      eprint={2110.05752},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2110.05752},
}

microsoft/wavlm-base

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
94.7M 768 Infinite 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct, pages={1505--1518} }

microsoft/wavlm-base-plus

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
94.7M 768 Infinite 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct, pages={1505--1518} }

microsoft/wavlm-base-plus-sd

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
94.7M 768 Infinite 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct, pages={1505--1518} }

microsoft/wavlm-base-plus-sv

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
94.7M 768 Infinite 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct, pages={1505--1518} }

microsoft/wavlm-base-sd

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
94.7M 768 Infinite 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct, pages={1505--1518} }

microsoft/wavlm-base-sv

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
94.7M 768 Infinite 361.0 MB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct, pages={1505--1518} }

microsoft/wavlm-large

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
316.6M 1024 Infinite 1.2 GB 2022-07-19 eng-Latn
Citation
@article{Chen_2022,
   title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct, pages={1505--1518} }

openai/whisper-base

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
74.0M 512 Infinite 277.0 MB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
      title={Robust Speech Recognition via Large-Scale Weak Supervision},
      author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
      year={2022},
      eprint={2212.04356},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2212.04356},
}

openai/whisper-large-v3

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.6B 1280 Infinite 5.7 GB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
      title={Robust Speech Recognition via Large-Scale Weak Supervision},
      author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
      year={2022},
      eprint={2212.04356},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2212.04356},
}

openai/whisper-medium

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
769.0M 1024 Infinite 2.8 GB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
      title={Robust Speech Recognition via Large-Scale Weak Supervision},
      author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
      year={2022},
      eprint={2212.04356},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2212.04356},
}

openai/whisper-small

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
244.0M 768 Infinite 922.0 MB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
      title={Robust Speech Recognition via Large-Scale Weak Supervision},
      author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
      year={2022},
      eprint={2212.04356},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2212.04356},
}

openai/whisper-tiny

License: mit • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
39.0M 512 Infinite 144.0 MB 2022-09-27 afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)
Citation
@misc{radford2022robustspeechrecognitionlargescale,
      title={Robust Speech Recognition via Large-Scale Weak Supervision},
      author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
      year={2022},
      eprint={2212.04356},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2212.04356},
}

speechbrain/cnn14-esc50

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
80.8M 2048 not specified 308.0 MB 2022-11-26 eng-Latn
Citation
@inproceedings{wang2022CRL,
    title={Learning Representations for New Sound Classes With Continual Self-Supervised Learning},
    author={Zhepei Wang and Cem Subakan and Xilin Jiang and Junkai Wu and Efthymios Tzinis and Mirco Ravanelli and Paris Smaragdis},
    year={2022},
    booktitle={Accepted to IEEE Signal Processing Letters}
}

speechbrain/m-ctc-t-large

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
1.1B 1536 not specified 3.9 GB 2022-01-10 abk-Cyrl, ara-Arab, asm-Beng, bre-Latn, cat-Latn, ... (58)
Citation
@misc{lugosch2022pseudolabelingmassivelymultilingualspeech,
      title={Pseudo-Labeling for Massively Multilingual Speech Recognition},
      author={Loren Lugosch and Tatiana Likhomanenko and Gabriel Synnaeve and Ronan Collobert},
      year={2022},
      eprint={2111.00161},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.00161},
}

vitouphy/wav2vec2-xls-r-300m-phoneme

License: apache-2.0 • Learn more →

Parameters Emb. Dim Max Tokens Memory Released Languages
300.0M 1024 Infinite 1.2 GB 2022-05-19 eng-Latn
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}