% Conference paper "Taco-VC" from the EUSIPCO 2020 proceedings.
% Braces inside the title protect the proper nouns Taco-VC and Tacotron from
% being lower-cased by styles that apply sentence casing to titles.
% Authors are stored in "Last, First" form so name parsing is unambiguous.
@inproceedings{52b347f17ede4a689bd02327f7c461de,
  author    = {Levy-Leshem, Roee and Giryes, Raja},
  title     = {{Taco-VC}: A single speaker {Tacotron} based voice conversion with limited data},
  booktitle = {28th European Signal Processing Conference, EUSIPCO 2020 - Proceedings},
  series    = {European Signal Processing Conference},
  publisher = {European Signal Processing Conference, EUSIPCO},
  pages     = {391--395},
  year      = {2021},
  month     = jan,
  day       = {24},
  doi       = {10.23919/Eusipco47968.2020.9287448},
  language  = {English},
  keywords  = {Adaptation, Speech Recognition, Speech Synthesis, Voice Conversion},
  abstract  = {This paper introduces Taco-VC, a novel architecture for voice conversion based on Tacotron synthesizer, which is a sequence-to-sequence with attention model. The training of multi-speaker voice conversion systems requires a large number of resources, both in training and corpus size. Taco-VC is implemented using a single speaker Tacotron synthesizer based on Phonetic PosteriorGrams (PPGs) and a single speaker WaveNet vocoder conditioned on mel spectrograms. To enhance the converted speech quality, and to overcome over-smoothing, the outputs of Tacotron are passed through a novel speech-enhancement network, which is composed of a combination of the phoneme recognition and Tacotron networks. Our system is trained just with a single speaker corpus and adapts to new speakers using only a few minutes of training data. Using mid-size public datasets, our method outperforms the baseline in the VCC 2018 SPOKE non-parallel voice conversion task and achieves competitive results compared to multi-speaker networks trained on large private datasets.},
  note      = {Publisher Copyright: {\textcopyright} 2021 European Signal Processing Conference, EUSIPCO. All rights reserved.; 28th European Signal Processing Conference, EUSIPCO 2020 ; Conference date: 24-08-2020 Through 28-08-2020},
}