Audio Model¶

Number of models: 41

Non-instruction Model¶

`MIT/ast-finetuned-audioset-10-10-0.4593`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	768	86.6M	330.0 MB	2021-07-08	eng-Latn

Citation

@misc{gong2021astaudiospectrogramtransformer,
      title={{AST}: Audio Spectrogram Transformer},
      author={Yuan Gong and Yu-An Chung and James Glass},
      year={2021},
      eprint={2104.01778},
      archivePrefix={arXiv},
      primaryClass={cs.SD},
      url={https://arxiv.org/abs/2104.01778},
}

`asapp/sew-d-base-plus-400k-ft-ls100h`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	95.0M	675.0 MB	2021-09-14	eng-Latn

Citation

@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
  author        = {Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
  title         = {Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
  year          = {2021},
  eprint        = {2109.06870},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2109.06870}
}

`asapp/sew-d-mid-400k-ft-ls100h`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	139.0M	530.0 MB	2021-09-14	eng-Latn

Citation

@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
  author        = {Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
  title         = {Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
  year          = {2021},
  eprint        = {2109.06870},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2109.06870}
}

`asapp/sew-d-tiny-100k-ft-ls100h`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	256	19.7M	92.0 MB	2021-09-14	eng-Latn

Citation

@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
  author        = {Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
  title         = {Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
  year          = {2021},
  eprint        = {2109.06870},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2109.06870}
}

`facebook/data2vec-audio-base-960h`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	768	93.2M	355.0 MB	2022-02-07	eng-Latn

Citation

@misc{baevski2022data2vecgeneralframeworkselfsupervised,
    title={{data2vec}: A General Framework for Self-supervised Learning in Speech, Vision and Language},
    author={Alexei Baevski and Wei-Ning Hsu and Qiantong Xu and Arun Babu and Jiatao Gu and Michael Auli},
    year={2022},
    eprint={2202.03555},
    archivePrefix={arXiv},
    primaryClass={cs.LG},
    url={https://arxiv.org/abs/2202.03555},
}

`facebook/data2vec-audio-large-960h`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	1024	313.3M	1.2 GB	2022-02-07	eng-Latn

Citation

@misc{baevski2022data2vecgeneralframeworkselfsupervised,
    title={{data2vec}: A General Framework for Self-supervised Learning in Speech, Vision and Language},
    author={Alexei Baevski and Wei-Ning Hsu and Qiantong Xu and Arun Babu and Jiatao Gu and Michael Auli},
    year={2022},
    eprint={2202.03555},
    archivePrefix={arXiv},
    primaryClass={cs.LG},
    url={https://arxiv.org/abs/2202.03555},
}

`facebook/encodec_24khz`¶

License: cc-by-nc-4.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	128	23.3M	88.0 MB	2022-10-25	eng-Latn

Citation

@misc{défossez2022highfidelityneuralaudio,
      title={High Fidelity Neural Audio Compression},
      author={Alexandre D{\'e}fossez and Jade Copet and Gabriel Synnaeve and Yossi Adi},
      year={2022},
      eprint={2210.13438},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2210.13438},
}

`facebook/hubert-base-ls960`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	95.0M	360.0 MB	2021-06-14	eng-Latn

Citation

@misc{hsu2021hubertselfsupervisedspeechrepresentation,
    title={{HuBERT}: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
    author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
    year={2021},
    eprint={2106.07447},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2106.07447},
}

`facebook/hubert-large-ls960-ft`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	317.0M	1.2 GB	2021-06-14	eng-Latn

Citation

@misc{hsu2021hubertselfsupervisedspeechrepresentation,
    title={{HuBERT}: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
    author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
    year={2021},
    eprint={2106.07447},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2106.07447},
}

`facebook/mms-1b-all`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	1024	1.0B	3.6 GB	2023-05-22	ara-Arab, cmn-Hans, deu-Latn, eng-Latn, fra-Latn, ... (7)

Citation

@misc{pratap2023scalingspeechtechnology1000,
    author        = {Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
    title         = {Scaling Speech Technology to 1,000+ Languages},
    year          = {2023},
    eprint        = {2305.13516},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2305.13516}
}

`facebook/mms-1b-fl102`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	1024	1.0B	3.6 GB	2023-05-22	eng-Latn

Citation

@misc{pratap2023scalingspeechtechnology1000,
    author        = {Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
    title         = {Scaling Speech Technology to 1,000+ Languages},
    year          = {2023},
    eprint        = {2305.13516},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2305.13516}
}

`facebook/mms-1b-l1107`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	1024	1.0B	3.6 GB	2023-05-22	eng-Latn

Citation

@misc{pratap2023scalingspeechtechnology1000,
    author        = {Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
    title         = {Scaling Speech Technology to 1,000+ Languages},
    year          = {2023},
    eprint        = {2305.13516},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2305.13516}
}

`facebook/seamless-m4t-v2-large`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	1024	2.3B	8.6 GB	2023-11-06	eng-Latn

Citation

@misc{communication2023seamlessmultilingualexpressivestreaming,
      title={Seamless: Multilingual Expressive and Streaming Speech Translation},
      author={{Seamless Communication} and Lo{\"\i}c Barrault and Yu-An Chung and Mariano Coria Meglioli and David Dale and Ning Dong and Mark Duppenthaler and Paul-Ambroise Duquenne and Brian Ellis and Hady Elsahar and Justin Haaheim and John Hoffman and Min-Jae Hwang and Hirofumi Inaguma and Christopher Klaiber and Ilia Kulikov and Pengwei Li and Daniel Licht and Jean Maillard and Ruslan Mavlyutov and Alice Rakotoarison and Kaushik Ram Sadagopan and Abinesh Ramakrishnan and Tuan Tran and Guillaume Wenzek and Yilin Yang and Ethan Ye and Ivan Evtimov and Pierre Fernandez and Cynthia Gao and Prangthip Hansanti and Elahe Kalbassi and Amanda Kallet and Artyom Kozhevnikov and Gabriel Mejia Gonzalez and Robin San Roman and Christophe Touret and Corinne Wong and Carleigh Wood and Bokai Yu and Pierre Andrews and Can Balioglu and Peng-Jen Chen and Marta R. Costa-juss{\`a} and Maha Elbayad and Hongyu Gong and Francisco Guzm{\'a}n and Kevin Heffernan and Somya Jain and Justine Kao and Ann Lee and Xutai Ma and Alex Mourachko and Benjamin Peloquin and Juan Pino and Sravya Popuri and Christophe Ropers and Safiyyah Saleem and Holger Schwenk and Anna Sun and Paden Tomasello and Changhan Wang and Jeff Wang and Skyler Wang and Mary Williamson},
      year={2023},
      eprint={2312.05187},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2312.05187},
}

`facebook/wav2vec2-base`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	95.0M	362.0 MB	2020-10-26	eng-Latn

Citation

@misc{baevski2020wav2vec20frameworkselfsupervised,
      title={{wav2vec} 2.0: A Framework for Self-Supervised Learning of Speech Representations},
      author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.11477},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.11477},
}

`facebook/wav2vec2-base-960h`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	95.0M	360.0 MB	2020-10-26	eng-Latn

Citation

@misc{baevski2020wav2vec20frameworkselfsupervised,
      title={{wav2vec} 2.0: A Framework for Self-Supervised Learning of Speech Representations},
      author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.11477},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.11477},
}

`facebook/wav2vec2-large`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	317.0M	1.2 GB	2020-10-26	eng-Latn

Citation

@misc{baevski2020wav2vec20frameworkselfsupervised,
      title={{wav2vec} 2.0: A Framework for Self-Supervised Learning of Speech Representations},
      author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.11477},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.11477},
}

`facebook/wav2vec2-large-xlsr-53`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	317.0M	1.2 GB	2020-10-26	eng-Latn

Citation

@misc{conneau2020unsupervisedcrosslingualrepresentationlearning,
  author        = {Alexis Conneau and Alexei Baevski and Ronan Collobert and Abdelrahman Mohamed and Michael Auli},
  title         = {Unsupervised Cross-lingual Representation Learning for Speech Recognition},
  year          = {2020},
  eprint        = {2006.13979},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2006.13979}
}

`facebook/wav2vec2-lv-60-espeak-cv-ft`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	317.0M	1.2 GB	2020-10-26	eng-Latn

Citation

@misc{baevski2020wav2vec20frameworkselfsupervised,
      title={{wav2vec} 2.0: A Framework for Self-Supervised Learning of Speech Representations},
      author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
      year={2020},
      eprint={2006.11477},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2006.11477},
}

`facebook/wav2vec2-xls-r-1b`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	1.0B	4.4 GB	2024-09-10	abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)

Citation

@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

`facebook/wav2vec2-xls-r-2b`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	2.0B	8.8 GB	2024-09-10	abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)

Citation

@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

`facebook/wav2vec2-xls-r-2b-21-to-en`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	2.0B	9.0 GB	2024-09-10	abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)

Citation

@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

`facebook/wav2vec2-xls-r-300m`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	300.0M	1.2 GB	2021-10-13	abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55)

Citation

@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}

`google/vggish`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	128	72.1M	275.0 MB	2019-06-13	eng-Latn

Citation

@inproceedings{hershey2017cnn,
    author = {Hershey, Shawn and Chaudhuri, Sourish and Ellis, Daniel P. W. and Gemmeke, Jort F. and Jansen, Aren and Moore, R. Channing and Plakal, Manoj and Platt, Devin and Saurous, Rif A. and Seybold, Bryan and Slaney, Malcolm and Weiss, Ron J. and Wilson, Kevin},
    title = {{CNN} Architectures for Large-Scale Audio Classification},
    year = {2017},
    publisher = {IEEE Press},
    url = {https://doi.org/10.1109/ICASSP.2017.7952132},
    doi = {10.1109/ICASSP.2017.7952132},
    booktitle = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
    pages = {131--135},
    numpages = {5},
    location = {New Orleans, LA, USA}
}

`google/yamnet`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	3.8M	14.0 MB	2020-10-06	eng-Latn

Citation

@inproceedings{audioset,
  title={{Audio Set}: An Ontology and Human-Labeled Dataset for Audio Events},
  author={Gemmeke, Jort F. and Ellis, Daniel P. W. and Freedman, Dylan and Jansen, Aren and Lawrence, Wade and Moore, R. Channing and Plakal, Manoj and Ritter, Marvin},
  booktitle={2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages={776--780},
  year={2017},
  organization={IEEE}
}

`microsoft/speecht5_asr`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	768	151.6M	578.0 MB	2022-05-16	eng-Latn

Citation

@misc{ao2022speecht5unifiedmodalencoderdecoderpretraining,
      title={{SpeechT5}: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
      author={Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
      year={2022},
      eprint={2110.07205},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2110.07205},
}

`microsoft/unispeech-sat-base-100h-libri-ft`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	94.0M	359.0 MB	2021-10-12	eng-Latn

Citation

@misc{chen2021unispeechsatuniversalspeechrepresentation,
      title={{UniSpeech-SAT}: Universal Speech Representation Learning with Speaker Aware Pre-Training},
      author={Sanyuan Chen and Yu Wu and Chengyi Wang and Zhengyang Chen and Zhuo Chen and Shujie Liu and Jian Wu and Yao Qian and Furu Wei and Jinyu Li and Xiangzhan Yu},
      year={2021},
      eprint={2110.05752},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2110.05752},
}

`microsoft/wavlm-base`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	94.7M	361.0 MB	2022-07-19	eng-Latn

Citation

@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518}
}

`microsoft/wavlm-base-plus`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	94.7M	361.0 MB	2022-07-19	eng-Latn

Citation

@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518}
}

`microsoft/wavlm-base-plus-sd`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	94.7M	361.0 MB	2022-07-19	eng-Latn

Citation

@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518}
}

`microsoft/wavlm-base-plus-sv`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	94.7M	361.0 MB	2022-07-19	eng-Latn

Citation

@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518}
}

`microsoft/wavlm-base-sd`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	94.7M	361.0 MB	2022-07-19	eng-Latn

Citation

@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518}
}

`microsoft/wavlm-base-sv`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	94.7M	361.0 MB	2022-07-19	eng-Latn

Citation

@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518}
}

`microsoft/wavlm-large`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	316.6M	1.2 GB	2022-07-19	eng-Latn

Citation

@article{Chen_2022,
   title={{WavLM}: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
   volume={16},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
   DOI={10.1109/jstsp.2022.3188113},
   number={6},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
   year={2022},
   month=oct,
   pages={1505--1518}
}

`openai/whisper-base`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	512	74.0M	277.0 MB	2022-09-27	afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)

Citation

@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356}
}

`openai/whisper-large-v3`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1280	1.6B	5.7 GB	2022-09-27	afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)

Citation

@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356}
}

`openai/whisper-medium`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	769.0M	2.8 GB	2022-09-27	afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)

Citation

@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356}
}

`openai/whisper-small`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	768	244.0M	922.0 MB	2022-09-27	afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)

Citation

@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356}
}

`openai/whisper-tiny`¶

License: mit

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	512	39.0M	144.0 MB	2022-09-27	afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99)

Citation

@misc{radford2022robustspeechrecognitionlargescale,
  author        = {Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
  title         = {Robust Speech Recognition via Large-Scale Weak Supervision},
  year          = {2022},
  eprint        = {2212.04356},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2212.04356}
}

`speechbrain/cnn14-esc50`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	2048	80.8M	308.0 MB	2022-11-26	eng-Latn

Citation

@inproceedings{wang2022CRL,
    title={Learning Representations for New Sound Classes With Continual Self-Supervised Learning},
    author={Zhepei Wang and Cem Subakan and Xilin Jiang and Junkai Wu and Efthymios Tzinis and Mirco Ravanelli and Paris Smaragdis},
    year={2022},
    booktitle={Accepted to IEEE Signal Processing Letters}
}

`speechbrain/m-ctc-t-large`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
not specified	1536	1.1B	3.9 GB	2022-01-10	abk-Cyrl, ara-Arab, asm-Beng, bre-Latn, cat-Latn, ... (58)

Citation

@misc{lugosch2022pseudolabelingmassivelymultilingualspeech,
  author        = {Loren Lugosch and Tatiana Likhomanenko and Gabriel Synnaeve and Ronan Collobert},
  title         = {Pseudo-Labeling for Massively Multilingual Speech Recognition},
  year          = {2022},
  eprint        = {2111.00161},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2111.00161}
}

`vitouphy/wav2vec2-xls-r-300m-phoneme`¶

License: apache-2.0

Max Tokens	Embedding dimension	Parameters	Required Memory (Mb)	Release date	Languages
infP	1024	300.0M	1.2 GB	2022-05-19	eng-Latn

Citation

@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
      title={{XLS-R}: Self-supervised Cross-lingual Speech Representation Learning at Scale},
      author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
      year={2021},
      eprint={2111.09296},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2111.09296},
}