# Audio Model
- Number of models: 41
## Non-instruction Model
### MIT/ast-finetuned-audioset-10-10-0.4593
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 86.6M | 330.0 MB | 2021-07-08 | eng-Latn |
Citation
@misc{gong2021astaudiospectrogramtransformer,
title={AST: Audio Spectrogram Transformer},
author={Yuan Gong and Yu-An Chung and James Glass},
year={2021},
eprint={2104.01778},
archivePrefix={arXiv},
primaryClass={cs.SD},
url={https://arxiv.org/abs/2104.01778},
}
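For context, a minimal sketch (not from the model card) of how one might pull a fixed-size clip embedding from this checkpoint with Hugging Face `transformers`. The 16 kHz placeholder waveform and the mean pooling over time frames are assumptions; the 768-dimensional hidden size matches the table above.

```python
# Illustrative sketch only: mean-pooled AST clip embedding via transformers.
# The random 16 kHz waveform and the pooling strategy are assumptions.
import numpy as np
import torch
from transformers import AutoFeatureExtractor, AutoModel

model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

waveform = np.random.randn(16000).astype(np.float32)  # 1 s placeholder clip
inputs = extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    frames = model(**inputs).last_hidden_state  # (1, num_patches, 768)
clip_embedding = frames.mean(dim=1)             # (1, 768), matching the table
```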
### asapp/sew-d-base-plus-400k-ft-ls100h
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 95.0M | 675.0 MB | 2021-09-14 | eng-Latn |
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
title={Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
author={Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
year={2021},
eprint={2109.06870},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2109.06870},
}
### asapp/sew-d-mid-400k-ft-ls100h
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 139.0M | 530.0 MB | 2021-09-14 | eng-Latn |
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
title={Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
author={Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
year={2021},
eprint={2109.06870},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2109.06870},
}
### asapp/sew-d-tiny-100k-ft-ls100h
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 256 | 19.7M | 92.0 MB | 2021-09-14 | eng-Latn |
Citation
@misc{wu2021performanceefficiencytradeoffsunsupervisedpretraining,
title={Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition},
author={Felix Wu and Kwangyoun Kim and Jing Pan and Kyu Han and Kilian Q. Weinberger and Yoav Artzi},
year={2021},
eprint={2109.06870},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2109.06870},
}
### facebook/data2vec-audio-base-960h
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 93.2M | 355.0 MB | 2022-02-07 | eng-Latn |
Citation
@misc{baevski2022data2vecgeneralframeworkselfsupervised,
title={data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language},
author={Alexei Baevski and Wei-Ning Hsu and Qiantong Xu and Arun Babu and Jiatao Gu and Michael Auli},
year={2022},
eprint={2202.03555},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2202.03555},
}
### facebook/data2vec-audio-large-960h
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 313.3M | 1.2 GB | 2022-02-07 | eng-Latn |
Citation
@misc{baevski2022data2vecgeneralframeworkselfsupervised,
title={data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language},
author={Alexei Baevski and Wei-Ning Hsu and Qiantong Xu and Arun Babu and Jiatao Gu and Michael Auli},
year={2022},
eprint={2202.03555},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2202.03555},
}
### facebook/encodec_24khz
License: cc-by-nc-4.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 128 | 23.3M | 88.0 MB | 2022-10-25 | eng-Latn |
Citation
@misc{défossez2022highfidelityneuralaudio,
title={High Fidelity Neural Audio Compression},
author={Alexandre Défossez and Jade Copet and Gabriel Synnaeve and Yossi Adi},
year={2022},
eprint={2210.13438},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2210.13438},
}
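EnCodec is a neural audio codec rather than a plain encoder, so the 128 above presumably refers to its latent/codebook dimension. A hedged sketch of encoding audio into discrete codes with `transformers` (the placeholder waveform is an assumption):

```python
# Illustrative sketch only: encoding audio into EnCodec's discrete codes.
import numpy as np
import torch
from transformers import AutoProcessor, EncodecModel

model_id = "facebook/encodec_24khz"
processor = AutoProcessor.from_pretrained(model_id)
model = EncodecModel.from_pretrained(model_id)

audio = np.random.randn(24000).astype(np.float32)  # 1 s placeholder at 24 kHz
inputs = processor(raw_audio=audio, sampling_rate=24000, return_tensors="pt")

with torch.no_grad():
    encoded = model.encode(inputs["input_values"], inputs["padding_mask"])
print(encoded.audio_codes.shape)  # discrete code indices, one set per codebook
```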
### facebook/hubert-base-ls960
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 95.0M | 360.0 MB | 2021-06-14 | eng-Latn |
Citation
@misc{hsu2021hubertselfsupervisedspeechrepresentation,
title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
year={2021},
eprint={2106.07447},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2106.07447},
}
### facebook/hubert-large-ls960-ft
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 317.0M | 1.2 GB | 2021-06-14 | eng-Latn |
Citation
@misc{hsu2021hubertselfsupervisedspeechrepresentation,
title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
year={2021},
eprint={2106.07447},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2106.07447},
}
### facebook/mms-1b-all
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 1.0B | 3.6 GB | 2023-05-22 | ara-Arab, cmn-Hans, deu-Latn, eng-Latn, fra-Latn, ... (7) |
Citation
@misc{pratap2023scalingspeechtechnology1000,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
year={2023},
eprint={2305.13516},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2305.13516},
}
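MMS checkpoints such as this one are CTC models with per-language adapters. A rough sketch of basic transcription through `transformers`, assuming the adapter and vocabulary that the checkpoint loads by default; the placeholder audio is not real speech.

```python
# Illustrative sketch only: CTC transcription with facebook/mms-1b-all.
# The default adapter/vocabulary is assumed; other languages may require
# selecting a target language when loading (see the model card).
import numpy as np
import torch
from transformers import AutoProcessor, Wav2Vec2ForCTC

model_id = "facebook/mms-1b-all"
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

speech = np.random.randn(16000).astype(np.float32)  # placeholder 16 kHz audio
inputs = processor(speech, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits                  # (1, frames, vocab)
ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(ids))                   # text for real speech input
```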
### facebook/mms-1b-fl102
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 1.0B | 3.6 GB | 2023-05-22 | eng-Latn |
Citation
@misc{pratap2023scalingspeechtechnology1000,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
year={2023},
eprint={2305.13516},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2305.13516},
}
### facebook/mms-1b-l1107
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 1.0B | 3.6 GB | 2023-05-22 | eng-Latn |
Citation
@misc{pratap2023scalingspeechtechnology1000,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
year={2023},
eprint={2305.13516},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2305.13516},
}
### facebook/seamless-m4t-v2-large
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1024 | 2.3B | 8.6 GB | 2023-11-06 | eng-Latn |
Citation
@misc{communication2023seamlessmultilingualexpressivestreaming,
title={Seamless: Multilingual Expressive and Streaming Speech Translation},
author={Seamless Communication and Loïc Barrault and Yu-An Chung and Mariano Coria Meglioli and David Dale and Ning Dong and Mark Duppenthaler and Paul-Ambroise Duquenne and Brian Ellis and Hady Elsahar and Justin Haaheim and John Hoffman and Min-Jae Hwang and Hirofumi Inaguma and Christopher Klaiber and Ilia Kulikov and Pengwei Li and Daniel Licht and Jean Maillard and Ruslan Mavlyutov and Alice Rakotoarison and Kaushik Ram Sadagopan and Abinesh Ramakrishnan and Tuan Tran and Guillaume Wenzek and Yilin Yang and Ethan Ye and Ivan Evtimov and Pierre Fernandez and Cynthia Gao and Prangthip Hansanti and Elahe Kalbassi and Amanda Kallet and Artyom Kozhevnikov and Gabriel Mejia Gonzalez and Robin San Roman and Christophe Touret and Corinne Wong and Carleigh Wood and Bokai Yu and Pierre Andrews and Can Balioglu and Peng-Jen Chen and Marta R. Costa-jussà and Maha Elbayad and Hongyu Gong and Francisco Guzmán and Kevin Heffernan and Somya Jain and Justine Kao and Ann Lee and Xutai Ma and Alex Mourachko and Benjamin Peloquin and Juan Pino and Sravya Popuri and Christophe Ropers and Safiyyah Saleem and Holger Schwenk and Anna Sun and Paden Tomasello and Changhan Wang and Jeff Wang and Skyler Wang and Mary Williamson},
year={2023},
eprint={2312.05187},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2312.05187},
}
### facebook/wav2vec2-base
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 95.0M | 362.0 MB | 2020-10-26 | eng-Latn |
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.11477},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2006.11477},
}
### facebook/wav2vec2-base-960h
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 95.0M | 360.0 MB | 2020-10-26 | eng-Latn |
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.11477},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2006.11477},
}
### facebook/wav2vec2-large
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 317.0M | 1.2 GB | 2020-10-26 | eng-Latn |
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.11477},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2006.11477},
}
### facebook/wav2vec2-large-xlsr-53
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 317.0M | 1.2 GB | 2020-10-26 | eng-Latn |
Citation
@misc{conneau2020unsupervisedcrosslingualrepresentationlearning,
title={Unsupervised Cross-lingual Representation Learning for Speech Recognition},
author={Alexis Conneau and Alexei Baevski and Ronan Collobert and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.13979},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2006.13979},
}
### facebook/wav2vec2-lv-60-espeak-cv-ft
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 317.0M | 1.2 GB | 2020-10-26 | eng-Latn |
Citation
@misc{baevski2020wav2vec20frameworkselfsupervised,
title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.11477},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2006.11477},
}
### facebook/wav2vec2-xls-r-1b
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 1.0B | 4.4 GB | 2024-09-10 | abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55) |
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
year={2021},
eprint={2111.09296},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2111.09296},
}
### facebook/wav2vec2-xls-r-2b
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 2.0B | 8.8 GB | 2024-09-10 | abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55) |
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
year={2021},
eprint={2111.09296},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2111.09296},
}
### facebook/wav2vec2-xls-r-2b-21-to-en
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 2.0B | 9.0 GB | 2024-09-10 | abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55) |
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
year={2021},
eprint={2111.09296},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2111.09296},
}
### facebook/wav2vec2-xls-r-300m
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 300.0M | 1.2 GB | 2021-10-13 | abk-Cyrl, afr-Latn, amh-Latn, ara-Latn, asm-Latn, ... (55) |
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
year={2021},
eprint={2111.09296},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2111.09296},
}
### google/vggish
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 128 | 72.1M | 275.0 MB | 2019-06-13 | eng-Latn |
Citation
@inproceedings{hershey2017cnn,
author = {Hershey, Shawn and Chaudhuri, Sourish and Ellis, Daniel P. W. and Gemmeke, Jort F. and Jansen, Aren and Moore, R. Channing and Plakal, Manoj and Platt, Devin and Saurous, Rif A. and Seybold, Bryan and Slaney, Malcolm and Weiss, Ron J. and Wilson, Kevin},
title = {CNN architectures for large-scale audio classification},
year = {2017},
publisher = {IEEE Press},
url = {https://doi.org/10.1109/ICASSP.2017.7952132},
doi = {10.1109/ICASSP.2017.7952132},
booktitle = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {131–135},
numpages = {5},
location = {New Orleans, LA, USA}
}
### google/yamnet
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 3.8M | 14.0 MB | 2020-10-06 | eng-Latn |
Citation
@inproceedings{audioset,
title={Audio set: An ontology and human-labeled dataset for audio events},
author={Gemmeke, Jort F and Ellis, Daniel PW and Freedman, Dylan and Jansen, Aren and Lawrence, Wade and Moore, R Channing and Plakal, Manoj and Ritter, Marvin},
booktitle={2017 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
pages={776--780},
year={2017},
organization={IEEE}
}
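YAMNet is distributed through TensorFlow Hub rather than `transformers`. A hedged sketch of pulling its 1024-dimensional frame embeddings; the hub handle is the standard public one, and the waveform is a placeholder.

```python
# Illustrative sketch only: YAMNet frame embeddings via TensorFlow Hub.
import numpy as np
import tensorflow_hub as hub

yamnet = hub.load("https://tfhub.dev/google/yamnet/1")

# YAMNet expects a 1-D float32 waveform at 16 kHz; this is placeholder noise.
waveform = np.random.randn(16000).astype(np.float32)
scores, embeddings, log_mel = yamnet(waveform)

print(embeddings.shape)  # (num_frames, 1024), matching the table's embedding dimension
```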
### microsoft/speecht5_asr
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 768 | 151.6M | 578.0 MB | 2022-05-16 | eng-Latn |
Citation
@misc{ao2022speecht5unifiedmodalencoderdecoderpretraining,
title={SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
author={Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
year={2022},
eprint={2110.07205},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2110.07205},
}
### microsoft/unispeech-sat-base-100h-libri-ft
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 94.0M | 359.0 MB | 2021-10-12 | eng-Latn |
Citation
@misc{chen2021unispeechsatuniversalspeechrepresentation,
title={UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware Pre-Training},
author={Sanyuan Chen and Yu Wu and Chengyi Wang and Zhengyang Chen and Zhuo Chen and Shujie Liu and Jian Wu and Yao Qian and Furu Wei and Jinyu Li and Xiangzhan Yu},
year={2021},
eprint={2110.05752},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2110.05752},
}
### microsoft/wavlm-base
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 94.7M | 361.0 MB | 2022-07-19 | eng-Latn |
Citation
@article{Chen_2022,
title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
volume={16},
ISSN={1941-0484},
url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
DOI={10.1109/jstsp.2022.3188113},
number={6},
journal={IEEE Journal of Selected Topics in Signal Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
year={2022},
month=oct, pages={1505–1518} }
### microsoft/wavlm-base-plus
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 94.7M | 361.0 MB | 2022-07-19 | eng-Latn |
Citation
@article{Chen_2022,
title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
volume={16},
ISSN={1941-0484},
url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
DOI={10.1109/jstsp.2022.3188113},
number={6},
journal={IEEE Journal of Selected Topics in Signal Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
year={2022},
month=oct, pages={1505–1518} }
### microsoft/wavlm-base-plus-sd
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 94.7M | 361.0 MB | 2022-07-19 | eng-Latn |
Citation
@article{Chen_2022,
title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
volume={16},
ISSN={1941-0484},
url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
DOI={10.1109/jstsp.2022.3188113},
number={6},
journal={IEEE Journal of Selected Topics in Signal Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
year={2022},
month=oct, pages={1505–1518} }
### microsoft/wavlm-base-plus-sv
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 94.7M | 361.0 MB | 2022-07-19 | eng-Latn |
Citation
@article{Chen_2022,
title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
volume={16},
ISSN={1941-0484},
url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
DOI={10.1109/jstsp.2022.3188113},
number={6},
journal={IEEE Journal of Selected Topics in Signal Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
year={2022},
month=oct, pages={1505–1518} }
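This checkpoint is fine-tuned for speaker verification, so its natural output is one x-vector per utterance rather than frame features. A hedged sketch of comparing two clips with cosine similarity; the placeholder audio and the idea of thresholding the score are assumptions, not part of the model card.

```python
# Illustrative sketch only: speaker similarity with WavLM x-vectors.
import numpy as np
import torch
from transformers import AutoFeatureExtractor, WavLMForXVector

model_id = "microsoft/wavlm-base-plus-sv"
extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = WavLMForXVector.from_pretrained(model_id)

clips = [np.random.randn(16000).astype(np.float32) for _ in range(2)]  # placeholders
inputs = extractor(clips, sampling_rate=16000, return_tensors="pt", padding=True)

with torch.no_grad():
    xvectors = model(**inputs).embeddings            # one x-vector per clip
xvectors = torch.nn.functional.normalize(xvectors, dim=-1)
similarity = torch.cosine_similarity(xvectors[0], xvectors[1], dim=-1)
print(float(similarity))  # higher means more likely the same speaker
```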
### microsoft/wavlm-base-sd
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 94.7M | 361.0 MB | 2022-07-19 | eng-Latn |
Citation
@article{Chen_2022,
title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
volume={16},
ISSN={1941-0484},
url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
DOI={10.1109/jstsp.2022.3188113},
number={6},
journal={IEEE Journal of Selected Topics in Signal Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
year={2022},
month=oct, pages={1505–1518} }
### microsoft/wavlm-base-sv
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 94.7M | 361.0 MB | 2022-07-19 | eng-Latn |
Citation
@article{Chen_2022,
title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
volume={16},
ISSN={1941-0484},
url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
DOI={10.1109/jstsp.2022.3188113},
number={6},
journal={IEEE Journal of Selected Topics in Signal Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
year={2022},
month=oct, pages={1505–1518} }
### microsoft/wavlm-large
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 316.6M | 1.2 GB | 2022-07-19 | eng-Latn |
Citation
@article{Chen_2022,
title={WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing},
volume={16},
ISSN={1941-0484},
url={http://dx.doi.org/10.1109/JSTSP.2022.3188113},
DOI={10.1109/jstsp.2022.3188113},
number={6},
journal={IEEE Journal of Selected Topics in Signal Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and Wu, Jian and Zhou, Long and Ren, Shuo and Qian, Yanmin and Qian, Yao and Wu, Jian and Zeng, Michael and Yu, Xiangzhan and Wei, Furu},
year={2022},
month=oct, pages={1505–1518} }
### openai/whisper-base
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 512 | 74.0M | 277.0 MB | 2022-09-27 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99) |
Citation
@misc{radford2022robustspeechrecognitionlargescale,
title={Robust Speech Recognition via Large-Scale Weak Supervision},
author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
year={2022},
eprint={2212.04356},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2212.04356},
}
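Whisper is an encoder–decoder ASR model, and the 512 in the table corresponds to the base model's encoder width. A hedged sketch of using the encoder alone as an audio feature extractor in `transformers`; the placeholder waveform and the mean pooling are assumptions about how one might embed audio with it.

```python
# Illustrative sketch only: Whisper encoder states as audio features.
import numpy as np
import torch
from transformers import AutoFeatureExtractor, WhisperModel

model_id = "openai/whisper-base"
extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = WhisperModel.from_pretrained(model_id)

audio = np.random.randn(16000).astype(np.float32)  # placeholder 16 kHz audio
inputs = extractor(audio, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    states = model.encoder(inputs.input_features).last_hidden_state  # (1, frames, 512)
clip_embedding = states.mean(dim=1)  # mean pooling is an illustrative choice
```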
### openai/whisper-large-v3
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1280 | 1.6B | 5.7 GB | 2022-09-27 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99) |
Citation
@misc{radford2022robustspeechrecognitionlargescale,
title={Robust Speech Recognition via Large-Scale Weak Supervision},
author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
year={2022},
eprint={2212.04356},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2212.04356},
}
### openai/whisper-medium
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 769.0M | 2.8 GB | 2022-09-27 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99) |
Citation
@misc{radford2022robustspeechrecognitionlargescale,
title={Robust Speech Recognition via Large-Scale Weak Supervision},
author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
year={2022},
eprint={2212.04356},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2212.04356},
}
### openai/whisper-small
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 768 | 244.0M | 922.0 MB | 2022-09-27 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99) |
Citation
@misc{radford2022robustspeechrecognitionlargescale,
title={Robust Speech Recognition via Large-Scale Weak Supervision},
author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
year={2022},
eprint={2212.04356},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2212.04356},
}
### openai/whisper-tiny
License: mit
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 512 | 39.0M | 144.0 MB | 2022-09-27 | afr-Latn, amh-Ethi, ara-Arab, asm-Beng, aze-Latn, ... (99) |
Citation
@misc{radford2022robustspeechrecognitionlargescale,
title={Robust Speech Recognition via Large-Scale Weak Supervision},
author={Alec Radford and Jong Wook Kim and Tao Xu and Greg Brockman and Christine McLeavey and Ilya Sutskever},
year={2022},
eprint={2212.04356},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2212.04356},
}
### speechbrain/cnn14-esc50
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 2048 | 80.8M | 308.0 MB | 2022-11-26 | eng-Latn |
Citation
@inproceedings{wang2022CRL,
title={Learning Representations for New Sound Classes With Continual Self-Supervised Learning},
author={Zhepei Wang and Cem Subakan and Xilin Jiang and Junkai Wu and Efthymios Tzinis and Mirco Ravanelli and Paris Smaragdis},
year={2022},
booktitle={Accepted to IEEE Signal Processing Letters}
}
### speechbrain/m-ctc-t-large
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| not specified | 1536 | 1.1B | 3.9 GB | 2022-01-10 | abk-Cyrl, ara-Arab, asm-Beng, bre-Latn, cat-Latn, ... (58) |
Citation
@misc{lugosch2022pseudolabelingmassivelymultilingualspeech,
title={Pseudo-Labeling for Massively Multilingual Speech Recognition},
author={Loren Lugosch and Tatiana Likhomanenko and Gabriel Synnaeve and Ronan Collobert},
year={2022},
eprint={2111.00161},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2111.00161},
}
### vitouphy/wav2vec2-xls-r-300m-phoneme
License: apache-2.0
| Max Tokens | Embedding dimension | Parameters | Required Memory | Release date | Languages |
|---|---|---|---|---|---|
| inf | 1024 | 300.0M | 1.2 GB | 2022-05-19 | eng-Latn |
Citation
@misc{babu2021xlsrselfsupervisedcrosslingualspeech,
title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale},
author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli},
year={2021},
eprint={2111.09296},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2111.09296},
}