2024
|
Quamer, W.; Das, A.; Gutierrez-Osuna, R. Speech synthesis and pronunciation teaching Book Chapter In: Chapelle, C. A.; Levis, J.; Munro, M.; Nagle, C.; Huensch, A. (Ed.): 2024. @inbook{nokey,
title = {Speech synthesis and pronunciation teaching},
author = {W. Quamer and A. Das and R. Gutierrez-Osuna},
editor = {C. A. Chapelle and J. Levis and M. Munro and C. Nagle and A. Huensch},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2024/09/book-chapter-preprint.pdf},
year = {2024},
date = {2024-12-31},
urldate = {2024-12-31},
keywords = {},
pubstate = {published},
tppubtype = {inbook}
}
|
Quamer, W.; Gutierrez-Osuna, R. End-to-end streaming model for low-latency speech anonymization Proceedings Article In: Proc. IEEE Spoken Language Technology Workshop (SLT 2024), 2024. @inproceedings{nokey,
title = {End-to-end streaming model for low-latency speech anonymization},
author = {W. Quamer and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2024/09/waris2024slt.pdf},
year = {2024},
date = {2024-12-02},
urldate = {2024-12-02},
booktitle = {Proc. IEEE Spoken Language Technology Workshop (SLT 2024)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Das, A.; Gutierrez-Osuna, R. Improving mispronunciation detection using speech reconstruction Journal Article Forthcoming In: IEEE/ACM Transactions on Audio, Speech and Language Processing, Forthcoming. @article{nokey,
title = {Improving mispronunciation detection using speech reconstruction},
author = {A. Das and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2024/07/anurag-taslp-2024.pdf},
year = {2024},
date = {2024-07-19},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
keywords = {},
pubstate = {forthcoming},
tppubtype = {article}
}
|
2023
|
Quamer, W.; Das, A.; Gutierrez-Osuna, R. Decoupling segmental and prosodic cues of non-native speech through vector quantization Proceedings Article In: Proc. Interspeech, 2023. @inproceedings{nokey,
title = {Decoupling segmental and prosodic cues of non-native speech through vector quantization},
author = {Waris Quamer and Anurag Das and Ricardo Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2023/11/decouplingProsody2023waris.pdf},
year = {2023},
date = {2023-08-20},
urldate = {2023-08-20},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2022
|
Quamer, W.; Das, A.; Levis, J.; Chukharev-Hudilainen, E.; Gutierrez-Osuna, R. Zero-Shot Foreign Accent Conversion without a Native Reference Proceedings Article In: Proc. Interspeech, 2022. @inproceedings{waris2022interspeech,
title = {Zero-Shot Foreign Accent Conversion without a Native Reference},
author = {W. Quamer and A. Das and J. Levis and E. Chukharev-Hudilainen and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2023/02/quamer2022interspeech.pdf},
year = {2022},
date = {2022-09-18},
urldate = {2022-09-18},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Liberatore, C.; Gutierrez-Osuna, R. Minimizing residuals for native-nonnative voice conversion in a sparse, anchor-based representation of speech Proceedings Article In: Proc. ICASSP, 2022. @inproceedings{liberatore2022icassp,
title = {Minimizing residuals for native-nonnative voice conversion in a sparse, anchor-based representation of speech},
author = {C. Liberatore and R. Gutierrez-Osuna},
year = {2022},
date = {2022-05-22},
booktitle = {Proc. ICASSP},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2021
|
Ding, S.; Zhao, G.; Gutierrez-Osuna, R. Accentron: Foreign accent conversion to arbitrary non-native speakers using zero-shot learning Journal Article In: Computer Speech & Language, 2021. @article{shaojin2021accentron,
title = {Accentron: Foreign accent conversion to arbitrary non-native speakers using zero-shot learning},
author = {S. Ding and G. Zhao and R. Gutierrez-Osuna},
url = {https://www.sciencedirect.com/science/article/pii/S0885230821001029
https://psi.engr.tamu.edu/wp-content/uploads/2021/10/1-s2.0-S0885230821001029-main.pdf},
year = {2021},
date = {2021-10-14},
urldate = {2021-10-14},
journal = {Computer Speech & Language},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
Silpachai, A.; Rehman, I.; Barriuso, T. A.; Levis, J.; Chukharev-Hudilainen, E.; Zhao, G.; Gutierrez-Osuna, R. Effects Of Voice Type And Task On L2 Learners’ Awareness Of Pronunciation Errors Proceedings Article In: Proc. Interspeech, 2021. @inproceedings{alif2021interspeech,
title = {Effects Of Voice Type And Task On L2 Learners’ Awareness Of Pronunciation Errors},
author = {A. Silpachai and I. Rehman and T. A. Barriuso and J. Levis and E. Chukharev-Hudilainen and G. Zhao and R. Gutierrez-Osuna},
year = {2021},
date = {2021-08-30},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Liberatore, C.; Gutierrez-Osuna, R. An Exemplar Selection Algorithm For Native-Nonnative Voice Conversion Proceedings Article In: Proc. Interspeech, 2021. @inproceedings{chris2021ARS,
title = {An Exemplar Selection Algorithm For Native-Nonnative Voice Conversion},
author = {C. Liberatore and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2022/07/liberatore21_interspeech.pdf},
year = {2021},
date = {2021-08-30},
urldate = {2021-08-30},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Zhao, G.; Ding, S.; Gutierrez-Osuna, R. Converting Foreign Accent Speech Without a Reference Journal Article In: IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 29, pp. 2367, 2021. @article{guanlong2021reference-free,
title = {Converting Foreign Accent Speech Without a Reference},
author = {G. Zhao and S. Ding and R. Gutierrez-Osuna},
url = {https://ieeexplore.ieee.org/abstract/document/9477581
https://psi.engr.tamu.edu/wp-content/uploads/2021/08/zhao2021reference.pdf},
year = {2021},
date = {2021-07-01},
urldate = {2021-07-01},
journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume = {29},
pages = {2367},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
2020
|
Ding, S.; Zhao, G.; Gutierrez-Osuna, R. Improving the Speaker Identity of Non-Parallel Many-to-Many Voice Conversion with Adversarial Speaker Recognition Proceedings Article In: Proc. Interspeech, 2020. @inproceedings{shaojin-2020-interspeech,
title = {Improving the Speaker Identity of Non-Parallel Many-to-Many Voice Conversion with Adversarial Speaker Recognition},
author = {S. Ding and G. Zhao and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2020/08/IS2020_shaojin_Adversarial_speaker_classifier_camera_ready.pdf},
year = {2020},
date = {2020-10-25},
urldate = {2020-10-25},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Das, A.; Zhao, G.; Levis, J.; Chukharev-Hudilainen, E.; Gutierrez-Osuna, R. Understanding the Effect of Voice Quality and Accent on Talker Similarity Proceedings Article In: Proc. Interspeech, 2020. @inproceedings{anurag-2020-interspeech,
title = {Understanding the Effect of Voice Quality and Accent on Talker Similarity},
author = {A. Das and G. Zhao and J. Levis and E. Chukharev-Hudilainen and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2023/02/das2020interspeech.pdf},
year = {2020},
date = {2020-10-24},
urldate = {2020-10-24},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Lučić, I.; Silpachai, A.; Levis, J.; Zhao, G.; Gutierrez-Osuna, R. The English Pronunciation of Arabic Speakers - A Data-Driven Approach to Segmental Error Identification Journal Article In: Language Teaching Research, 2020. @article{ivana2020ltr,
title = {The English Pronunciation of Arabic Speakers - A Data-Driven Approach to Segmental Error Identification},
author = {I. Lučić and A. Silpachai and J. Levis and G. Zhao and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2020/06/1362168820931888.pdf},
year = {2020},
date = {2020-06-18},
journal = {Language Teaching Research},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
2019
|
Ding, S.; Zhao, G.; Liberatore, C.; Gutierrez-Osuna, R. Learning Structured Sparse Representations for Voice Conversion Journal Article In: IEEE Transactions on Audio, Speech and Language Processing, vol. 28, pp. 343-354, 2019. @article{shaojin-2019-taslp,
title = {Learning Structured Sparse Representations for Voice Conversion},
author = {S. Ding and G. Zhao and C. Liberatore and R. Gutierrez-Osuna},
url = {https://ieeexplore.ieee.org/document/8910392
https://psi.engr.tamu.edu/wp-content/uploads/2020/04/shaojin2019taslp.pdf},
doi = {10.1109/TASLP.2019.2955289},
year = {2019},
date = {2019-11-15},
journal = {IEEE Transactions on Audio, Speech and Language Processing},
volume = {28},
pages = {343-354},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
Ding, S.; Liberatore, C.; Sonsaat, S.; Lučić, I.; Silpachai, A.; Zhao, G.; Chukharev-Hudilainen, E.; Levis, J.; Gutierrez-Osuna, R. Golden speaker builder – An interactive tool for pronunciation training Journal Article In: Speech Communication, vol. 115, pp. 51-66, 2019. @article{shaojin-2019-speechcomm,
title = {Golden speaker builder – An interactive tool for pronunciation training},
author = {S. Ding and C. Liberatore and S. Sonsaat and I. Lučić and A. Silpachai and G. Zhao and E. Chukharev-Hudilainen and J. Levis and R. Gutierrez-Osuna},
url = {https://www.sciencedirect.com/science/article/pii/S0167639319302675
https://psi.engr.tamu.edu/wp-content/uploads/2019/11/1-s2.0-S0167639319302675-main.pdf
},
year = {2019},
date = {2019-11-14},
journal = {Speech Communication},
volume = {115},
pages = {51-66},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
Ding, S.; Gutierrez-Osuna, R. Group Latent Embedding for Vector Quantized Variational Autoencoder in Non-Parallel Voice Conversion Proceedings Article In: Proc. Interspeech, 2019. @inproceedings{shaojin2019-interspeech,
title = {Group Latent Embedding for Vector Quantized Variational Autoencoder in Non-Parallel Voice Conversion},
author = {S. Ding and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2019/06/ding2019interspeech.pdf},
doi = {10.21437/Interspeech.2019-1198},
year = {2019},
date = {2019-09-15},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Zhao, G.; Ding, S.; Gutierrez-Osuna, R. Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams Proceedings Article In: Proc. Interspeech, 2019. @inproceedings{guanlong2019-interspeech,
title = {Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams},
author = {G. Zhao and S. Ding and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2019/07/zhao2019interspeech.pdf},
year = {2019},
date = {2019-09-15},
urldate = {2019-09-15},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Zhao, G.; Gutierrez-Osuna, R. Using Phonetic Posteriorgram Based Frame Pairing for Segmental Accent Conversion Journal Article In: IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 27, no. 10, pp. 1649-1660, 2019, ISSN: 2329-9290. @article{zhao-2019-taslp,
title = {Using Phonetic Posteriorgram Based Frame Pairing for Segmental Accent Conversion},
author = {G. Zhao and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2020/04/zhao2019taslp.pdf},
doi = {10.1109/TASLP.2019.2926754},
issn = {2329-9290},
year = {2019},
date = {2019-07-04},
urldate = {2019-07-04},
journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume = {27},
number = {10},
pages = {1649-1660},
abstract = {Accent conversion (AC) aims to transform non-native utterances to sound as if the speaker had a native accent. This can be achieved by mapping source speech spectra from a native speaker into the acoustic space of the target non-native speaker. In prior work, we proposed an AC approach that matches frames between the two speakers based on their acoustic similarity after compensating for differences in vocal tract length. In this paper, we propose a new approach that matches frames between the two speakers based on their phonetic (rather than acoustic) similarity. Namely, we map frames from the two speakers into a phonetic posteriorgram using speaker-independent acoustic models trained on native speech. We thoroughly evaluate the approach on a speech corpus containing multiple native and non-native speakers. The proposed algorithm outperforms the prior approach, improving ratings of acoustic quality (22% increase in mean opinion score) and native accent (69% preference) while retaining the voice quality of the non-native speaker. Furthermore, we show that the approach can be used in the reverse conversion direction, i.e., generating speech with a native speaker's voice quality and a non-native accent. Finally, we show that this approach can be applied to non-parallel training data, achieving the same accent conversion performance.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Accent conversion (AC) aims to transform non-native utterances to sound as if the speaker had a native accent. This can be achieved by mapping source speech spectra from a native speaker into the acoustic space of the target non-native speaker. In prior work, we proposed an AC approach that matches frames between the two speakers based on their acoustic similarity after compensating for differences in vocal tract length. In this paper, we propose a new approach that matches frames between the two speakers based on their phonetic (rather than acoustic) similarity. Namely, we map frames from the two speakers into a phonetic posteriorgram using speaker-independent acoustic models trained on native speech. We thoroughly evaluate the approach on a speech corpus containing multiple native and non-native speakers. The proposed algorithm outperforms the prior approach, improving ratings of acoustic quality (22% increase in mean opinion score) and native accent (69% preference) while retaining the voice quality of the non-native speaker. Furthermore, we show that the approach can be used in the reverse conversion direction, i.e., generating speech with a native speaker's voice quality and a non-native accent. Finally, we show that this approach can be applied to non-parallel training data, achieving the same accent conversion performance. |
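To make the frame-pairing idea concrete, here is a minimal sketch of matching frames by phonetic similarity. It assumes posteriorgrams have already been extracted with a speaker-independent acoustic model; the function name `pair_frames_by_ppg` and the symmetric KL distance are illustrative choices, not the paper's exact implementation.

```python
import numpy as np

def pair_frames_by_ppg(ppg_src, ppg_tgt, eps=1e-8):
    """Pair each source frame with the phonetically closest target frame.

    ppg_src: (n_src, n_phones) posterior rows, each summing to 1
    ppg_tgt: (n_tgt, n_phones)
    Returns pairing[i] = index of the best target frame for source frame i.
    """
    p = np.clip(ppg_src, eps, 1.0)[:, None, :]  # (n_src, 1, n_phones)
    q = np.clip(ppg_tgt, eps, 1.0)[None, :, :]  # (1, n_tgt, n_phones)
    # Symmetric KL divergence between every (source, target) frame pair
    skl = np.sum((p - q) * (np.log(p) - np.log(q)), axis=-1)
    return skl.argmin(axis=1)

# Toy posteriorgrams: 5 source frames, 8 target frames, 40 phone classes
rng = np.random.default_rng(0)
print(pair_frames_by_ppg(rng.dirichlet(np.ones(40), size=5),
                         rng.dirichlet(np.ones(40), size=8)))
```

The paired frames would then serve as training data for the spectral mapping used in conversion.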
2018
|
Levis, J.; Chukharev-Hudilainen, E.; Gutierrez-Osuna, R.; Lucic, I.; Silpachai, A.; Sonsaat, S. Golden Speaker: Learner Experience with Computer-assisted Pronunciation Practice Proceedings Article In: Proc. Pronunciation in Second Language Learning and Teaching Conference, 2018. @inproceedings{levis2018psslt,
title = {Golden Speaker: Learner Experience with Computer-assisted Pronunciation Practice},
author = {J. Levis and E. Chukharev-Hudilainen and R. Gutierrez-Osuna and I. Lucic and A. Silpachai and S. Sonsaat},
year = {2018},
date = {2018-09-06},
booktitle = {Proc. Pronunciation in Second Language Learning and Teaching Conference},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Zhao, G.; Sonsaat, S.; Silpachai, A.; Lucic, I.; Chukharev-Hudilainen, E.; Levis, J.; Gutierrez-Osuna, R. L2-ARCTIC: A Non-Native English Speech Corpus Proceedings Article In: Proc. Interspeech, 2018. @inproceedings{zhao2018interspeech,
title = {L2-ARCTIC: A Non-Native English Speech Corpus},
author = {G Zhao and S Sonsaat and A Silpachai and I Lucic and E Chukharev-Hudilainen and J Levis and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/08/zhao2018interspeech.pdf
https://psi.engr.tamu.edu/l2-arctic-corpus/},
year = {2018},
date = {2018-09-02},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Ding, S.; Zhao, G.; Liberatore, C.; Gutierrez-Osuna, R. Improving Sparse Representations in Exemplar-Based Voice Conversion with a Phoneme-Selective Objective Function Proceedings Article In: Proc. Interspeech, 2018. @inproceedings{ding2018interspeech2,
title = {Improving Sparse Representations in Exemplar-Based Voice Conversion with a Phoneme-Selective Objective Function},
author = {S. Ding and G. Zhao and C. Liberatore and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/09/ding2018interspeech2.pdf},
doi = {10.21437/Interspeech.2018-1272},
year = {2018},
date = {2018-09-02},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Ding, S.; Liberatore, C.; Gutierrez-Osuna, R. Learning Structured Dictionaries for Exemplar-based Voice Conversion Proceedings Article In: Proc. Interspeech, 2018. @inproceedings{ding2018interspeech1,
title = {Learning Structured Dictionaries for Exemplar-based Voice Conversion},
author = {S. Ding and C. Liberatore and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/09/ding2018interspeech1.pdf},
doi = {10.21437/Interspeech.2018-1295},
year = {2018},
date = {2018-09-02},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Liberatore, C.; Zhao, G.; Gutierrez-Osuna, R. Voice Conversion through Residual Warping in a Sparse, Anchor-Based Representation of Speech Proceedings Article In: Proc. ICASSP, 2018. @inproceedings{liberatore2018icassp,
title = {Voice Conversion through Residual Warping in a Sparse, Anchor-Based Representation of Speech},
author = {C Liberatore and G Zhao and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/03/liberatore-icassp2018.pdf},
year = {2018},
date = {2018-04-15},
booktitle = {Proc. ICASSP},
abstract = {In previous work we presented a Sparse, Anchor-Based Representation of speech (SABR) that uses phonemic “anchors” to represent an utterance with a set of sparse non-negative weights. SABR is speaker-independent: combining weights from a source speaker with anchors from a target speaker can be used for voice conversion. Here, we present an extension of the original SABR that significantly improves voice conversion synthesis. Namely, we take the residual signal from the SABR decomposition of the source speaker’s utterance, and warp it to the target speaker’s space using a weighted warping function learned from pairs of source-target anchors. Using subjective and objective evaluations, we examine the performance of adding the warped residual (SABR+Res) to the original synthesis (SABR). Specifically, listeners rated SABR+Res with an average mean opinion score (MOS) of 3.6, a significant improvement compared to 2.2 MOS for SABR alone (p < 0.01) and 2.5 MOS for a baseline GMM method (p < 0.01). In an XAB speaker identity test, listeners correctly identified the identity of SABR+Res (81%) and SABR (84%) as frequently as a GMM method (82%) (p = 0.70, p = 0.35). These results indicate that adding the warped residual can dramatically improve synthesis while retaining the desirable independent qualities of SABR models.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
In previous work we presented a Sparse, Anchor-Based Representation of speech (SABR) that uses phonemic “anchors” to represent an utterance with a set of sparse non-negative weights. SABR is speaker-independent: combining weights from a source speaker with anchors from a target speaker can be used for voice conversion. Here, we present an extension of the original SABR that significantly improves voice conversion synthesis. Namely, we take the residual signal from the SABR decomposition of the source speaker’s utterance, and warp it to the target speaker’s space using a weighted warping function learned from pairs of source-target anchors. Using subjective and objective evaluations, we examine the performance of adding the warped residual (SABR+Res) to the original synthesis (SABR). Specifically, listeners rated SABR+Res with an average mean opinion score (MOS) of 3.6, a significant improvement compared to 2.2 MOS for SABR alone (p < 0.01) and 2.5 MOS for a baseline GMM method (p < 0.01). In an XAB speaker identity test, listeners correctly identified the identity of SABR+Res (81%) and SABR (84%) as frequently as a GMM method (82%) (p = 0.70, p = 0.35). These results indicate that adding the warped residual can dramatically improve synthesis while retaining the desirable independent qualities of SABR models. |
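The residual-warping step lends itself to a compact sketch. The NNLS solver stands in for the paper's sparse decomposition, and the single least-squares matrix `W` is a simplification of the weighted warping function learned from source-target anchor pairs; both are assumptions for illustration.

```python
import numpy as np
from scipy.optimize import nnls

def fit_residual_warp(res_src, res_tgt):
    """Least-squares linear map from paired source residuals to target
    residuals; res_src, res_tgt are (n_frames, n_dims)."""
    W, *_ = np.linalg.lstsq(res_src, res_tgt, rcond=None)
    return W.T

def convert_frame(x, A_src, A_tgt, W):
    """SABR weights from the source dictionary, synthesis with the target
    dictionary, plus the warped residual (the SABR+Res idea)."""
    w, _ = nnls(A_src, x)        # non-negative anchor weights
    residual = x - A_src @ w     # spectral detail the anchors miss
    return A_tgt @ w + W @ residual

# Toy example: 24-dim frames, 10 anchors per speaker
rng = np.random.default_rng(0)
A_src, A_tgt = rng.random((24, 10)), rng.random((24, 10))
frames = rng.random((50, 24))
res = np.stack([f - A_src @ nnls(A_src, f)[0] for f in frames])
W = fit_residual_warp(res, res)  # placeholder: real pairs come from aligned data
print(convert_frame(frames[0], A_src, A_tgt, W).shape)  # (24,)
```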
Zhao, G.; Sonsaat, S.; Levis, J.; Chukharev-Hudilainen, E.; Gutierrez-Osuna, R. Accent conversion using phonetic posteriorgrams Proceedings Article In: Proc. ICASSP, 2018. @inproceedings{zhao2018icassp,
title = {Accent conversion using phonetic posteriorgrams},
author = {G Zhao and S Sonsaat and J Levis and E Chukharev-Hudilainen and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/03/zhao2018icassp.pdf
http://people.tamu.edu/~guanlong.zhao/icassp18_demo.html
https://psi.engr.tamu.edu/l2-arctic-corpus/
https://github.com/guanlongzhao/ppg-gmm},
year = {2018},
date = {2018-04-15},
urldate = {2018-04-15},
booktitle = {Proc. ICASSP},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2016
|
Aryal, S.; Gutierrez-Osuna, R. Comparing Articulatory and Acoustic Strategies for Reducing Non-Native Accents Proceedings Article In: Proc. Interspeech, 2016. @inproceedings{aryal-2016-interspeech,
title = {Comparing Articulatory and Acoustic Strategies for Reducing Non-Native Accents},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2016interspeech.pdf},
year = {2016},
date = {2016-09-08},
booktitle = {Proc. Interspeech},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Aryal, S.; Gutierrez-Osuna, R. Data driven articulatory synthesis with deep neural networks Journal Article In: Computer Speech and Language, vol. 36, pp. 260-273, 2016. @article{aryal-2015-cls,
title = {Data driven articulatory synthesis with deep neural networks},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2016csl.pdf},
year = {2016},
date = {2016-03-01},
urldate = {2016-03-01},
journal = {Computer Speech and Language},
volume = {36},
pages = {260-273},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
2015
|
Liberatore, C.; Aryal, S.; Wang, Z.; Polsley, S.; Gutierrez-Osuna, R. SABR: Sparse, Anchor-Based Representation of the Speech Signal Proceedings Article In: Proc. Interspeech 2015, pp. 608-612, 2015. @inproceedings{liberatore2015interspeech,
title = {SABR: Sparse, Anchor-Based Representation of the Speech Signal},
author = {C Liberatore and S Aryal and Z Wang and S Polsley and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/liberatore2015interspeech.pdf},
year = {2015},
date = {2015-09-06},
booktitle = {Proc. Interspeech 2015},
pages = {608-612},
abstract = {We present SABR (Sparse, Anchor-Based Representation), an analysis technique to decompose the speech signal into speaker-dependent and speaker-independent components. Given a collection of utterances for a particular speaker, SABR uses the centroid for each phoneme as an acoustic “anchor,” then applies Lasso regularization to represent each frame as a sparse, non-negative combination of anchors. We evaluate the method on a speaker-independent phoneme recognition task and a voice conversion task. Using a linear classifier, SABR weights achieve significantly higher phoneme recognition rates than Mel-frequency cepstral coefficients. SABR weights can also be used directly to perform accent conversion without the need to train a speaker-to-speaker regression model.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
We present SABR (Sparse, Anchor-Based Representation), an analysis technique to decompose the speech signal into speaker-dependent and speaker-independent components. Given a collection of utterances for a particular speaker, SABR uses the centroid for each phoneme as an acoustic “anchor,” then applies Lasso regularization to represent each frame as a sparse, non-negative combination of anchors. We evaluate the method on a speaker-independent phoneme recognition task and a voice conversion task. Using a linear classifier, SABR weights achieve significantly higher phoneme recognition rates than Mel-frequency cepstral coefficients. SABR weights can also be used directly to perform accent conversion without the need to train a speaker-to-speaker regression model. |
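The abstract describes the core SABR computation precisely enough to sketch: per-phoneme centroids serve as anchors, and Lasso with a non-negativity constraint yields the sparse weights. The sketch below uses scikit-learn's `Lasso`; the feature dimensionality and `alpha` value are illustrative assumptions.

```python
import numpy as np
from sklearn.linear_model import Lasso

def phoneme_anchors(frames, labels, n_phones):
    """One acoustic anchor per phoneme: the centroid of its frames."""
    return np.stack([frames[labels == p].mean(axis=0) for p in range(n_phones)])

def sabr_encode(frames, anchors, alpha=0.01):
    """Sparse non-negative anchor weights for each frame via Lasso.

    frames: (n_frames, n_dims), anchors: (n_phones, n_dims)
    Returns (n_frames, n_phones) weights.
    """
    model = Lasso(alpha=alpha, positive=True, fit_intercept=False, max_iter=5000)
    model.fit(anchors.T, frames.T)  # one Lasso problem per frame, solved jointly
    return model.coef_

# Toy data: 200 frames of 24-dim features over 10 phone classes
rng = np.random.default_rng(1)
feats, labels = rng.random((200, 24)), rng.integers(0, 10, size=200)
A_src = phoneme_anchors(feats, labels, 10)
w = sabr_encode(feats[:5], A_src)
# Voice conversion per the abstract: source weights, target anchors
# converted = w @ A_tgt
print(w.shape)  # (5, 10), mostly zeros
```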
Aryal, S.; Gutierrez-Osuna, R. Articulatory-based conversion of foreign accents with deep neural networks Proceedings Article In: Proc. Interspeech, pp. 3385-3389, 2015. @inproceedings{aryal2015interspeech,
title = {Articulatory-based conversion of foreign accents with deep neural networks},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2015interspeech.pdf},
year = {2015},
date = {2015-09-06},
urldate = {2015-09-06},
booktitle = {Proc. Interspeech},
pages = {3385-3389},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Liberatore, C.; Gutierrez-Osuna, R. Joint Optimization of Anatomical and Gestural Parameters in a Physical Vocal Tract Model Proceedings Article In: Proc. ICASSP, IEEE, 2015. @inproceedings{liberatore2015icassp,
title = {Joint Optimization of Anatomical and Gestural Parameters in a Physical Vocal Tract Model},
author = {C Liberatore and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/liberatore2015icassp.pdf},
year = {2015},
date = {2015-04-19},
booktitle = {ICASSP},
organization = {IEEE},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Aryal, S.; Gutierrez-Osuna, R. Reduction of non-native accents through statistical parametric articulatory synthesis Journal Article In: Journal of the Acoustical Society of America, vol. 137, no. 1, pp. 433-446, 2015. @article{aryal2015jasa,
title = {Reduction of non-native accents through statistical parametric articulatory synthesis},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2015jasa.pdf},
year = {2015},
date = {2015-01-23},
journal = {Journal of the Acoustical Society of America},
volume = {137},
number = {1},
pages = {433-446},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
2014
|
Felps, D.; Aryal, S.; Gutierrez-Osuna, R. Normalization of articulatory data through Procrustes transformations and analysis-by-synthesis Proceedings Article In: Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 3051-3055, 2014. @inproceedings{danielprocrustes2014icassp,
title = {Normalization of articulatory data through Procrustes transformations and analysis-by-synthesis},
author = {D Felps and S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/danielprocrustes2014icassp.pdf},
year = {2014},
date = {2014-05-09},
booktitle = {Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {3051-3055},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Aryal, S.; Gutierrez-Osuna, R. Can voice conversion be used to reduce non-native accents? Proceedings Article In: Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 7929-7933, 2014. @inproceedings{sandeshaccentconversion2014icassp,
title = {Can voice conversion be used to reduce non-native accents?},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/sandeshaccentconversion2014icassp.pdf},
year = {2014},
date = {2014-05-09},
booktitle = {Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {7929-7933},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Aryal, S.; Gutierrez-Osuna, R. Accent conversion through cross-speaker articulatory synthesis Proceedings Article In: Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 7744-7748, 2014. @inproceedings{sandesh2014icassp,
title = {Accent conversion through cross-speaker articulatory synthesis},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/sandesh2014icassp.pdf},
year = {2014},
date = {2014-05-09},
booktitle = {Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {7744-7748},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2013
|
Aryal, S.; Felps, D.; Gutierrez-Osuna, R. Foreign Accent Conversion through Voice Morphing Proceedings Article In: Interspeech, pp. 3077-3081, 2013. @inproceedings{aryal2013interspeech,
title = {Foreign Accent Conversion through Voice Morphing},
author = {S Aryal and D Felps and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2013interspeech.pdf},
year = {2013},
date = {2013-08-25},
booktitle = {Interspeech},
pages = {3077-3081},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2012
|
Felps, D.; Geng, C.; Gutierrez-Osuna, R. Foreign accent conversion through concatenative synthesis in the articulatory domain Journal Article In: IEEE Transactions on Audio, Speech and Language Processing, 2012. @article{felps2012taslp,
title = {Foreign accent conversion through concatenative synthesis in the articulatory domain},
author = {D Felps and C Geng and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2012taslp.pdf},
year = {2012},
date = {2012-01-01},
journal = {IEEE Transactions on Audio, Speech and Language Processing},
abstract = {We propose a concatenative synthesis approach to the problem of foreign accent conversion. The approach consists of replacing the most accented portions of nonnative speech with alternative segments from a corpus of the speaker’s own speech based on their similarity to those from a reference native speaker. We propose and compare two approaches for selecting units, one based on acoustic similarity [e.g., mel frequency cepstral coefficients (MFCCs)] and a second one based on articulatory similarity, as measured through electromagnetic articulography (EMA). Our hypothesis is that articulatory features provide a better metric for linguistic similarity across speakers than acoustic features. To test this hypothesis, we recorded an articulatory-acoustic corpus from a native and a nonnative speaker, and evaluated the two speech representations (acoustic versus articulatory) through a series of perceptual experiments. Formal listening tests indicate that the approach can achieve a 20% reduction in perceived accent, but also reveal a strong coupling between accent and speaker identity. To address this issue, we disguised original and resynthesized utterances by altering their average pitch and normalizing vocal tract length. An additional listening experiment supports the hypothesis that articulatory features are less speaker dependent than acoustic features.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
We propose a concatenative synthesis approach to the problem of foreign accent conversion. The approach consists of replacing the most accented portions of nonnative speech with alternative segments from a corpus of the speaker’s own speech based on their similarity to those from a reference native speaker. We propose and compare two approaches for selecting units, one based on acoustic similarity [e.g., mel frequency cepstral coefficients (MFCCs)] and a second one based on articulatory similarity, as measured through electromagnetic articulography (EMA). Our hypothesis is that articulatory features provide a better metric for linguistic similarity across speakers than acoustic features. To test this hypothesis, we recorded an articulatory-acoustic corpus from a native and a nonnative speaker, and evaluated the two speech representations (acoustic versus articulatory) through a series of perceptual experiments. Formal listening tests indicate that the approach can achieve a 20% reduction in perceived accent, but also reveal a strong coupling between accent and speaker identity. To address this issue, we disguised original and resynthesized utterances by altering their average pitch and normalizing vocal tract length. An additional listening experiment supports the hypothesis that articulatory features are less speaker dependent than acoustic features. |
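The unit-selection comparison at the heart of this paper can be sketched as follows. Everything here is a simplification for illustration: candidate segments are assumed to be pre-cut and time-aligned to the native reference (the paper's pipeline would handle alignment, e.g. with DTW), and the frame-wise Euclidean distance stands in for its similarity measure.

```python
import numpy as np

def select_replacement(ref_seg, candidate_segs):
    """Pick the speaker's own segment whose trajectory best matches the
    native reference; works for EMA (articulatory) or MFCC (acoustic)
    features, which is exactly the comparison the paper makes.

    ref_seg:        (n_frames, n_feats) reference trajectory
    candidate_segs: list of (n_frames, n_feats) candidates, same length
    """
    dists = [np.linalg.norm(seg - ref_seg, axis=1).mean()
             for seg in candidate_segs]
    return int(np.argmin(dists))

# Toy usage: 3 candidate EMA segments of 30 frames x 12 channels
rng = np.random.default_rng(2)
ref = rng.random((30, 12))
cands = [rng.random((30, 12)) for _ in range(3)]
print(select_replacement(ref, cands))
```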
2010
|
Gutierrez-Osuna, R.; Felps, D. Foreign Accent Conversion through Voice Morphing Technical Report 2010. @techreport{gutierrez2010techreport,
title = {Foreign Accent Conversion through Voice Morphing},
author = {R Gutierrez-Osuna and D Felps},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/gutierrez2010techreport.pdf},
year = {2010},
date = {2010-05-05},
abstract = {We present a voice morphing strategy that can be used to generate a continuum of accent transformations between a foreign speaker and a native speaker. The approach performs a cepstral decomposition of speech into spectral slope and spectral detail. Accent conversions are then generated by combining the spectral slope of the foreign speaker with a morph of the spectral detail of the native speaker. Spectral morphing is achieved by representing the spectral detail through pulse density modulation and averaging pulses in a pair-wise fashion. The technique is evaluated on parallel recordings from two ARCTIC speakers using objective measures of acoustic quality, speaker identity and foreign accent that have been recently shown to correlate with perceptual results from listening tests.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
We present a voice morphing strategy that can be used to generate a continuum of accent transformations between a foreign speaker and a native speaker. The approach performs a cepstral decomposition of speech into spectral slope and spectral detail. Accent conversions are then generated by combining the spectral slope of the foreign speaker with a morph of the spectral detail of the native speaker. Spectral morphing is achieved by representing the spectral detail through pulse density modulation and averaging pulses in a pair-wise fashion. The technique is evaluated on parallel recordings from two ARCTIC speakers using objective measures of acoustic quality, speaker identity and foreign accent that have been recently shown to correlate with perceptual results from listening tests. |
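The cepstral slope/detail split is simple to reproduce with a lifter; the sketch below does so for a single log-magnitude frame. The cutoff `n_slope` and the linear mix noted at the end are assumptions: the paper morphs the detail through pulse-density modulation, which a linear interpolation only crudely approximates.

```python
import numpy as np

def split_slope_detail(log_mag, n_slope=4):
    """Split one log-magnitude spectrum (length n_fft//2 + 1) into a smooth
    spectral-slope part (low quefrency) and the remaining spectral detail."""
    cep = np.fft.irfft(log_mag)            # real cepstrum
    lifter = np.zeros_like(cep)
    lifter[:n_slope] = 1.0
    if n_slope > 1:
        lifter[-(n_slope - 1):] = 1.0      # mirror half (cepstrum is symmetric)
    slope = np.fft.rfft(cep * lifter).real
    return slope, log_mag - slope

# Accent continuum per the abstract: foreign slope + morphed native detail.
# A linear mix stands in for the paper's pulse-density morphing:
#   combined = slope_f + (1 - alpha) * detail_f + alpha * detail_n
rng = np.random.default_rng(3)
frame = np.log(np.abs(np.fft.rfft(rng.standard_normal(256))) + 1e-6)
slope, detail = split_slope_detail(frame)
print(np.allclose(slope + detail, frame))  # True: exact decomposition
```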
Felps, D.; Geng, C.; Berger, M.; Richmond, K.; Gutierrez-Osuna, R. Relying on critical articulators to estimate vocal tract spectra in an articulatory-acoustic database Conference Interspeech, 2010. @conference{felps2010interspeech,
title = {Relying on critical articulators to estimate vocal tract spectra in an articulatory-acoustic database},
author = {D Felps and C Geng and M Berger and K Richmond and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2010interspeech.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Interspeech},
abstract = {We present a new phone-dependent feature weighting scheme that can be used to map articulatory configurations (e.g. EMA) onto vocal tract spectra (e.g. MFCC) through table lookup. The approach consists of assigning feature weights according to a feature's ability to predict the acoustic distance between frames. Since an articulator's predictive accuracy is phone-dependent (e.g., lip location is a better predictor for bilabial sounds than for palatal sounds), a unique weight vector is found for each phone. Inspection of the weights reveals a correspondence with the expected critical articulators for many phones. The proposed method reduces overall cepstral error by 6% when compared to a uniform weighting scheme. Vowels show the greatest benefit, though improvements occur for 80% of the tested phones.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
We present a new phone-dependent feature weighting scheme that can be used to map articulatory configurations (e.g. EMA) onto vocal tract spectra (e.g. MFCC) through table lookup. The approach consists of assigning feature weights according to a feature's ability to predict the acoustic distance between frames. Since an articulator's predictive accuracy is phone-dependent (e.g., lip location is a better predictor for bilabial sounds than for palatal sounds), a unique weight vector is found for each phone. Inspection of the weights reveals a correspondence with the expected critical articulators for many phones. The proposed method reduces overall cepstral error by 6% when compared to a uniform weighting scheme. Vowels show the greatest benefit, though improvements occur for 80% of the tested phones. |
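A minimal version of the phone-dependent weighting can be sketched directly from the abstract's description. The correlation between per-feature articulatory distance and acoustic distance is an assumed stand-in for the paper's exact predictive criterion, and all array shapes are illustrative.

```python
import numpy as np

def phone_weights(ema, mfcc, labels, phone):
    """Weight each articulatory channel by how well its frame-pair distance
    predicts acoustic distance within one phone."""
    idx = np.where(labels == phone)[0]
    i, j = np.triu_indices(len(idx), k=1)           # all frame pairs
    d_ac = np.linalg.norm(mfcc[idx[i]] - mfcc[idx[j]], axis=1)
    w = np.array([max(np.corrcoef(np.abs(ema[idx[i], f] - ema[idx[j], f]),
                                  d_ac)[0, 1], 0.0)
                  for f in range(ema.shape[1])])
    return w / w.sum()

def lookup(query, ema, mfcc, w):
    """Weighted nearest-neighbor table lookup: articulatory frame -> spectrum."""
    d = ((ema - query) ** 2 * w).sum(axis=1)
    return mfcc[d.argmin()]

# Toy corpus: 300 frames, 12 EMA channels, 13 MFCCs, 5 phone classes
rng = np.random.default_rng(4)
ema, mfcc = rng.random((300, 12)), rng.random((300, 13))
labels = rng.integers(0, 5, size=300)
w = phone_weights(ema, mfcc, labels, phone=0)
print(lookup(ema[0], ema, mfcc, w).shape)  # (13,)
```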
Felps, D.; Gutierrez-Osuna, R. Developing objective measures of foreign-accent conversion Journal Article In: IEEE Transactions on Audio, Speech, and Language Processing, vol. 18, no. 5, pp. 1030–1040, 2010. @article{felps2010talsp,
title = {Developing objective measures of foreign-accent conversion},
author = {D Felps and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2010talsp.pdf},
year = {2010},
date = {2010-01-01},
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
volume = {18},
number = {5},
pages = {1030--1040},
publisher = {IEEE},
abstract = {Various methods have recently appeared to transform foreign-accented speech into its native-accented counterpart. Evaluation of these accent conversion methods requires extensive listening tests across a number of perceptual dimensions. This article presents three objective measures that may be used to assess the acoustic quality, degree of foreign accent, and speaker identity of accent-converted utterances. Accent conversion generates novel utterances: those of a foreign speaker with a native accent. Therefore, the acoustic quality in accent conversion cannot be evaluated with conventional measures of spectral distortion, which assume that a clean recording of the speech signal is available for comparison. Here we evaluate a single-ended measure of speech quality, ITU-T recommendation P.563 for narrow-band telephony. We also propose a measure of foreign accent that exploits a weakness of automatic speech recognizers: their sensitivity to foreign accents. Namely, we use phoneme-level match scores given by the HTK recognizer trained on a large number of American English speakers to obtain a measure of native accent. Finally, we propose a measure of speaker identity that projects acoustic vectors (e.g., Mel cepstral, F0) onto the linear discriminant that maximizes separability for a given pair of source and target speakers. The three measures are evaluated on a corpus of accent-converted utterances that had been previously rated through perceptual tests. Our results show that the three measures have a high degree of correlation with their corresponding subjective ratings, suggesting that they may be used to accelerate the development of foreign-accent conversion tools. Applications of these measures in the context of computer assisted pronunciation training and voice conversion are also discussed.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Various methods have recently appeared to transform foreign-accented speech into its native-accented counterpart. Evaluation of these accent conversion methods requires extensive listening tests across a number of perceptual dimensions. This article presents three objective measures that may be used to assess the acoustic quality, degree of foreign accent, and speaker identity of accent-converted utterances. Accent conversion generates novel utterances: those of a foreign speaker with a native accent. Therefore, the acoustic quality in accent conversion cannot be evaluated with conventional measures of spectral distortion, which assume that a clean recording of the speech signal is available for comparison. Here we evaluate a single-ended measure of speech quality, ITU-T recommendation P.563 for narrow-band telephony. We also propose a measure of foreign accent that exploits a weakness of automatic speech recognizers: their sensitivity to foreign accents. Namely, we use phoneme-level match scores given by the HTK recognizer trained on a large number of American English speakers to obtain a measure of native accent. Finally, we propose a measure of speaker identity that projects acoustic vectors (e.g., Mel cepstral, F0) onto the linear discriminant that maximizes separability for a given pair of source and target speakers. The three measures are evaluated on a corpus of accent-converted utterances that had been previously rated through perceptual tests. Our results show that the three measures have a high degree of correlation with their corresponding subjective ratings, suggesting that they may be used to accelerate the development of foreign-accent conversion tools. Applications of these measures in the context of computer assisted pronunciation training and voice conversion are also discussed. |
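Of the three measures, the speaker-identity one is the easiest to sketch. The score normalization (0 near the source speaker, 1 near the target) is an illustrative convention, and scikit-learn's LDA stands in for whatever discriminant fitting the paper used.

```python
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def identity_score(src_feats, tgt_feats, conv_feats):
    """Project frames onto the source-vs-target linear discriminant and
    report where the converted utterance lands (0 ~ source, 1 ~ target)."""
    X = np.vstack([src_feats, tgt_feats])
    y = np.r_[np.zeros(len(src_feats)), np.ones(len(tgt_feats))]
    lda = LinearDiscriminantAnalysis(n_components=1).fit(X, y)
    s, t, c = (lda.transform(f).mean() for f in (src_feats, tgt_feats, conv_feats))
    return (c - s) / (t - s)

# Toy frames: converted speech sits between the two speakers
rng = np.random.default_rng(5)
src = rng.normal(0.0, 1.0, (200, 13))
tgt = rng.normal(2.0, 1.0, (200, 13))
conv = rng.normal(1.5, 1.0, (50, 13))
print(identity_score(src, tgt, conv))  # roughly 0.75
```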
2009
|
Felps, D.; Bortfeld, H.; Gutierrez-Osuna, R. Foreign accent conversion in computer assisted pronunciation training Journal Article In: Speech Communication, vol. 51, no. 10, pp. 920–932, 2009. @article{felps2009foreign,
title = {Foreign accent conversion in computer assisted pronunciation training},
author = {D Felps and H Bortfeld and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2009foreign.pdf},
year = {2009},
date = {2009-01-01},
journal = {Speech Communication},
volume = {51},
number = {10},
pages = {920--932},
publisher = {Elsevier},
abstract = {Learners of a second language practice their pronunciation by listening to and imitating utterances from native speakers. Recent research has shown that choosing a well-matched native speaker to imitate can have a positive impact on pronunciation training. Here we propose a voice-transformation technique that can be used to generate the (arguably) ideal voice to imitate: the learner’s own voice with a native accent. Our work extends previous research, which suggests that providing learners with prosodically corrected versions of their utterances can be a suitable form of feedback in computer assisted pronunciation training. Our technique provides a conversion of both prosodic and segmental characteristics by means of a pitch-synchronous decomposition of speech into glottal excitation and spectral envelope. We apply the technique to a corpus containing parallel recordings of foreign-accented and native-accented utterances, and validate the resulting accent conversions through a series of perceptual experiments. Our results indicate that the technique can reduce foreign accentedness without significantly altering the voice quality properties of the foreign speaker. Finally, we propose a pedagogical strategy for integrating accent conversion as a form of behavioral shaping in computer assisted pronunciation training.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Learners of a second language practice their pronunciation by listening to and imitating utterances from native speakers. Recent research has shown that choosing a well-matched native speaker to imitate can have a positive impact on pronunciation training. Here we propose a voice-transformation technique that can be used to generate the (arguably) ideal voice to imitate: the learner’s own voice with a native accent. Our work extends previous research, which suggests that providing learners with prosodically corrected versions of their utterances can be a suitable form of feedback in computer assisted pronunciation training. Our technique provides a conversion of both prosodic and segmental characteristics by means of a pitch-synchronous decomposition of speech into glottal excitation and spectral envelope. We apply the technique to a corpus containing parallel recordings of foreign-accented and native-accented utterances, and validate the resulting accent conversions through a series of perceptual experiments. Our results indicate that the technique can reduce foreign accentedness without significantly altering the voice quality properties of the foreign speaker. Finally, we propose a pedagogical strategy for integrating accent conversion as a form of behavioral shaping in computer assisted pronunciation training. |
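The segmental half of the transformation, re-filtering the learner's excitation with a native spectral envelope, can be sketched with LPC in place of the paper's pitch-synchronous glottal/envelope decomposition. The frame length and LPC order are assumptions, and a real system would operate pitch-synchronously with overlap-add.

```python
import numpy as np
import librosa
from scipy.signal import lfilter

def swap_envelope(frame_foreign, frame_native, order=18):
    """Whiten the foreign frame with its own LPC inverse filter, then
    re-shape it with the native frame's all-pole envelope."""
    a_f = librosa.lpc(frame_foreign, order=order)
    a_n = librosa.lpc(frame_native, order=order)
    excitation = lfilter(a_f, [1.0], frame_foreign)  # remove foreign envelope
    return lfilter([1.0], a_n, excitation)           # impose native envelope

# Toy frames (real use: windowed, pitch-synchronous speech frames)
rng = np.random.default_rng(6)
print(swap_envelope(rng.standard_normal(400), rng.standard_normal(400)).shape)
```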
2008
|
Felps, D.; Bortfeld, H.; Gutierrez-Osuna, R. Prosodic and segmental factors in foreign-accent conversion Technical Report 2008. @techreport{felps08prosodic,
title = {Prosodic and segmental factors in foreign-accent conversion},
author = {D Felps and H Bortfeld and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps08prosodic.pdf},
year = {2008},
date = {2008-07-11},
abstract = {We propose a signal processing method that transforms foreign-accented speech to resemble its native-accented counterpart. The problem is closely related to voice conversion, except that our method seeks to preserve the organic properties of the foreign speaker’s voice; i.e., only those features which cue foreign-accentedness are to be transformed. Our method operates at two levels: prosodic and segmental. Prosodic transformation is performed by means of time and pitch scaling. Segmental transformation is performed by convolving the foreign speaker’s excitation with the warped spectral envelope of the native speaker. Perceptual results indicate that our model is able to provide a 63% reduction in foreign-accentedness. Multidimensional scaling also shows that the segmental transformation causes the perception of a new speaker to emerge, though the identity of this new speaker is three times closer to the foreign speaker than to the native speaker.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
We propose a signal processing method that transforms foreign-accented speech to resemble its native-accented counterpart. The problem is closely related to voice conversion, except that our method seeks to preserve the organic properties of the foreign speaker’s voice; i.e., only those features which cue foreign-accentedness are to be transformed. Our method operates at two levels: prosodic and segmental. Prosodic transformation is performed by means of time and pitch scaling. Segmental transformation is performed by convolving the foreign speaker’s excitation with the warped spectral envelope of the native speaker. Perceptual results indicate that our model is able to provide a 63% reduction in foreign-accentedness. Multidimensional scaling also shows that the segmental transformation causes the perception of a new speaker to emerge, though the identity of this new speaker is three times closer to the foreign speaker than to the native speaker. |
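The prosodic level of the method, time and pitch scaling toward the native reference, maps onto standard building blocks. The sketch below applies a single global scaling with librosa; the paper works at a finer granularity, and the scale factors shown are hypothetical.

```python
import numpy as np
import librosa

def prosodic_correction(y_foreign, sr, f0_foreign, f0_native, dur_ratio):
    """Global time scaling toward the native duration, then a pitch shift
    toward the native speaker's mean F0."""
    y = librosa.effects.time_stretch(y_foreign, rate=dur_ratio)
    n_steps = 12 * np.log2(f0_native / f0_foreign)   # semitones
    return librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

# Hypothetical usage: speak 10% faster, raise mean F0 from 110 Hz to 120 Hz
y, sr = librosa.load(librosa.example("trumpet"))     # stand-in audio
out = prosodic_correction(y, sr, f0_foreign=110.0, f0_native=120.0, dur_ratio=1.1)
print(out.shape)
```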