2024
|
Quamer, W.; Das, A.; Gutierrez-Osuna, R. Speech synthesis and pronunciation teaching Book Chapter In: Chapelle, C. A.; Levis, J.; Munro, M.; Nagle, C.; Huensch, A. (Ed.): 2024. @inbook{quamer2024chapter,
title = {Speech synthesis and pronunciation teaching},
author = {W. Quamer and A. Das and R. Gutierrez-Osuna},
editor = {C. A. Chapelle and J. Levis and M. Munro and C. Nagle and A. Huensch},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2024/09/book-chapter-preprint.pdf},
year = {2024},
date = {2024-12-31},
urldate = {2024-12-31},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inbook}
}
|
Quamer, W.; Gutierrez-Osuna, R. End-to-end streaming model for low-latency speech anonymization Proceedings Article In: Proc. IEEE Spoken Language Technology Workshop (SLT 2024), 2024. @inproceedings{waris2024slt,
title = {End-to-end streaming model for low-latency speech anonymization},
author = {W. Quamer and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2024/09/waris2024slt.pdf},
year = {2024},
date = {2024-12-02},
urldate = {2024-12-02},
booktitle = {Proc. IEEE Spoken Language Technology Workshop (SLT 2024)},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Das, A.; Gutierrez-Osuna, R. Improving mispronunciation detection using speech reconstruction Journal Article Forthcoming In: IEEE/ACM Transactions on Audio, Speech and Language Processing, Forthcoming. @article{anurag2024taslp,
title = {Improving mispronunciation detection using speech reconstruction},
author = {A. Das and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2024/07/anurag-taslp-2024.pdf},
year = {2024},
date = {2024-07-19},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
keywords = {Accent conversion, Speech},
pubstate = {forthcoming},
tppubtype = {article}
}
|
Silpachai, A.; Neiriz, R.; Novotny, M.; Gutierrez-Osuna, R.; Levis, J. M.; Chukharev, E. Corrective feedback accuracy and pronunciation improvement: Feedback that is ‘good enough’ Journal Article In: Language Learning & Technology, vol. 28, iss. 1, pp. 1–16, 2024. @article{silpachai2024llt,
title = {Corrective feedback accuracy and pronunciation improvement: Feedback that is ‘good enough’},
author = {A. Silpachai and R. Neiriz and M. Novotny and R. Gutierrez-Osuna and J. M. Levis and E. Chukharev},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2024/07/llt-2024-good-enough.pdf},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
journal = {Language Learning & Technology},
volume = {28},
issue = {1},
pages = {1–16},
keywords = {Speech},
pubstate = {published},
tppubtype = {article}
}
|
2023
|
Quamer, W.; Das, A.; Gutierrez-Osuna, R. Decoupling segmental and prosodic cues of non-native speech through vector quantization Proceedings Article In: Proc. Interspeech, 2023. @inproceedings{waris2023interspeech,
title = {Decoupling segmental and prosodic cues of non-native speech through vector quantization},
author = {Waris Quamer and Anurag Das and Ricardo Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2023/11/decouplingProsody2023waris.pdf},
year = {2023},
date = {2023-08-20},
urldate = {2023-08-20},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Speech, Voice conversion},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2022
|
Quamer, W.; Das, A.; Levis, J.; Chukharev-Hudilainen, E.; Gutierrez-Osuna, R. Zero-Shot Foreign Accent Conversion without a Native Reference Proceedings Article In: Proc. Interspeech, 2022. @inproceedings{waris2022interspeech,
title = {Zero-Shot Foreign Accent Conversion without a Native Reference},
author = {W. Quamer and A. Das and J. Levis and E. Chukharev-Hudilainen and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2023/02/quamer2022interspeech.pdf},
year = {2022},
date = {2022-09-18},
urldate = {2022-09-18},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Deep learning, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Liberatore, C.; Gutierrez-Osuna, R. Minimizing residuals for native-nonnative voice conversion in a sparse, anchor-based representation of speech Proceedings Article In: Proc. ICASSP, 2022. @inproceedings{liberatore2022icassp,
title = {Minimizing residuals for native-nonnative voice conversion in a sparse, anchor-based representation of speech},
author = {C. Liberatore and R. Gutierrez-Osuna},
year = {2022},
date = {2022-05-22},
booktitle = {Proc. ICASSP},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2021
|
Ding, S.; Zhao, G.; Gutierrez-Osuna, R. Accentron: Foreign accent conversion to arbitrary non-native speakers using zero-shot learning Journal Article In: Computer Speech & Language, 2021. @article{shaojin2021accentron,
title = {Accentron: Foreign accent conversion to arbitrary non-native speakers using zero-shot learning},
author = {S. Ding and G. Zhao and R. Gutierrez-Osuna},
url = {https://www.sciencedirect.com/science/article/pii/S0885230821001029
https://psi.engr.tamu.edu/wp-content/uploads/2021/10/1-s2.0-S0885230821001029-main.pdf},
year = {2021},
date = {2021-10-14},
urldate = {2021-10-14},
journal = {Computer Speech & Language},
keywords = {Accent conversion, Deep learning, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Liberatore, C.; Gutierrez-Osuna, R. An Exemplar Selection Algorithm For Native-Nonnative Voice Conversion Proceedings Article In: Proc. Interspeech, 2021. @inproceedings{chris2021ARS,
title = {An Exemplar Selection Algorithm For Native-Nonnative Voice Conversion},
author = {C. Liberatore and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2022/07/liberatore21_interspeech.pdf},
year = {2021},
date = {2021-08-30},
urldate = {2021-08-30},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Silpachai, A.; Rehman, I.; Barriuso, T. A.; Levis, J.; Chukharev-Hudilainen, E.; Zhao, G.; Gutierrez-Osuna, R. Effects Of Voice Type And Task On L2 Learners’ Awareness Of Pronunciation Errors Proceedings Article In: Proc. Interspeech, 2021. @inproceedings{alif2021interspeech,
title = {Effects Of Voice Type And Task On L2 Learners’ Awareness Of Pronunciation Errors},
author = {A. Silpachai and I. Rehman and T. A. Barriuso and J. Levis and E. Chukharev-Hudilainen and G. Zhao and R. Gutierrez-Osuna},
year = {2021},
date = {2021-08-30},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Hair, A.; Zhao, G.; Ahmed, B.; Ballard, K. J.; Gutierrez-Osuna, R. Assessing Posterior-Based Mispronunciation Detection on Field-Collected Recordings from Child Speech Therapy Sessions Proceedings Article In: Proc. Interspeech, 2021. @inproceedings{adam2021interspeech,
title = {Assessing Posterior-Based Mispronunciation Detection on Field-Collected Recordings from Child Speech Therapy Sessions},
author = {A. Hair and G. Zhao and B. Ahmed and K. J. Ballard and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2021/08/hair_interspeech_2021.pdf},
year = {2021},
date = {2021-08-30},
booktitle = {Proc. Interspeech},
keywords = {Automatic Speech Recognition, Childhood apraxia of speech, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Zhao, G.; Ding, S.; Gutierrez-Osuna, R. Converting Foreign Accent Speech Without a Reference Journal Article In: IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 29, pp. 2367, 2021. @article{guanlong2021reference-free,
title = {Converting Foreign Accent Speech Without a Reference},
author = {G. Zhao and S. Ding and R. Gutierrez-Osuna},
url = {https://ieeexplore.ieee.org/abstract/document/9477581
https://psi.engr.tamu.edu/wp-content/uploads/2021/08/zhao2021reference.pdf},
year = {2021},
date = {2021-07-01},
urldate = {2021-07-01},
journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume = {29},
pages = {2367},
keywords = {Accent conversion, Deep learning, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Hair, A.; Ballard, K. J.; Markoulli, C.; Monroe, P.; McKechnie, J.; Ahmed, B.; Gutierrez-Osuna, R. A Longitudinal Evaluation of Tablet-Based Child Speech Therapy with Apraxia World Journal Article In: ACM Transactions on Accessible Computing, vol. 14, no. 1, 2021. @article{adam2021TACCESS,
title = {A Longitudinal Evaluation of Tablet-Based Child Speech Therapy with Apraxia World},
author = {A. Hair and K. J. Ballard and C. Markoulli and P. Monroe and J. McKechnie and B. Ahmed and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2021/08/hair_taccess_2021.pdf},
doi = {10.1145/3433607},
year = {2021},
date = {2021-03-15},
journal = {ACM Transactions on Accessible Computing},
volume = {14},
number = {1},
abstract = {Digital games can make speech therapy exercises more enjoyable for children and increase their motivation during therapy. However, many such games developed to date have not been designed for long-term use. To address this issue, we developed Apraxia World, a speech therapy game specifically intended to be played over extended periods. In this study, we examined pronunciation improvements, child engagement over time, and caregiver and automated pronunciation evaluation accuracy while using our game over a multi-month period. Ten children played Apraxia World at home during two counterbalanced four-week treatment blocks separated by a two-week break. In one treatment phase, children received pronunciation feedback from caregivers and in the other treatment phase, utterances were evaluated with an automated framework built into the game. We found that children made therapeutically significant speech improvements while using Apraxia World, and that the game successfully increased engagement during speech therapy practice. Additionally, in offline mispronunciation detection tests, our automated pronunciation evaluation framework outperformed a traditional method based on goodness of pronunciation scoring. Our results suggest that this type of speech therapy game is a valid complement to traditional home practice.},
keywords = {Games, Health, Speech},
pubstate = {published},
tppubtype = {article}
}
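For context on the baseline mentioned in this abstract: the traditional goodness-of-pronunciation (GoP) score is commonly defined per phone as a duration-normalized log-posterior ratio (Witt & Young's formulation; the paper's exact variant may differ):
\mathrm{GoP}(p) \;=\; \frac{1}{N_p}\,\Bigl|\,\log P(O_p \mid p) \;-\; \max_{q \in \mathcal{Q}} \log P(O_p \mid q)\Bigr|
where O_p denotes the acoustic frames force-aligned to phone p, N_p is the number of those frames, and \mathcal{Q} is the phone inventory; a low GoP flags a likely mispronunciation.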
|
2020
|
Ding, S.; Zhao, G.; Gutierrez-Osuna, R. Improving the Speaker Identity of Non-Parallel Many-to-Many Voice Conversion with Adversarial Speaker Recognition Proceedings Article In: Proc. Interspeech, 2020. @inproceedings{shaojin-2020-interspeech,
title = {Improving the Speaker Identity of Non-Parallel Many-to-Many Voice Conversion with Adversarial Speaker Recognition},
author = {S. Ding and G. Zhao and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2020/08/IS2020_shaojin_Adversarial_speaker_classifier_camera_ready.pdf},
year = {2020},
date = {2020-10-25},
urldate = {2020-10-25},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Deep learning, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Das, A.; Zhao, G.; Levis, J.; Chukharev-Hudilainen, E.; Gutierrez-Osuna, R. Understanding the Effect of Voice Quality and Accent on Talker Similarity Proceedings Article In: Proc. Interspeech, 2020. @inproceedings{anurag-2020-interspeech,
title = {Understanding the Effect of Voice Quality and Accent on Talker Similarity},
author = {A. Das and G. Zhao and J. Levis and E. Chukharev-Hudilainen and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2023/02/das2020interspeech.pdf},
year = {2020},
date = {2020-10-24},
urldate = {2020-10-24},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Deep learning, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Lučić, I.; Silpachai, A.; Levis, J.; Zhao, G.; Gutierrez-Osuna, R. The English Pronunciation of Arabic Speakers - A Data-Driven Approach to Segmental Error Identification Journal Article In: Language Teaching Research, 2020. @article{ivana2020ltr,
title = {The English Pronunciation of Arabic Speakers - A Data-Driven Approach to Segmental Error Identification},
author = {I. Lučić and A. Silpachai and J. Levis and G Zhao and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2020/06/1362168820931888.pdf},
year = {2020},
date = {2020-06-18},
journal = {Language Teaching Research},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {article}
}
|
McKechnie, J.; Ahmed, B.; Gutierrez-Osuna, R.; Murray, E.; McCabe, P.; Ballard, K. The influence of type of feedback during tablet-based delivery of intensive treatment for childhood apraxia of speech Journal Article In: Journal of Communication Disorders, 2020. @article{McKechnie2020tabbyTalks,
title = {The influence of type of feedback during tablet-based delivery of intensive treatment for childhood apraxia of speech},
author = {J. McKechnie and B. Ahmed and R. Gutierrez-Osuna and E. Murray and P. McCabe and K. Ballard},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2020/08/1-s2.0-S0021992420300940-main.pdf
https://www.sciencedirect.com/science/article/pii/S0021992420300940},
year = {2020},
date = {2020-05-20},
journal = {Journal of Communication Disorders},
keywords = {Health, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Hair, A; Markoulli, C; Monroe, P; McKechnie, J; Ballard, K J; Ahmed, B; Gutierrez-Osuna, R Preliminary Results From a Longitudinal Study of a Tablet-Based Speech Therapy Game Proceedings Article In: Extended Abstracts of the 2020 CHI Conference on Human Factors in Computing Systems, ACM, 2020, ISBN: 978-1-4503-6819-3/20/04. @inproceedings{hair2020chi,
title = {Preliminary Results From a Longitudinal Study of a Tablet-Based Speech Therapy Game},
author = {A Hair and C Markoulli and P Monroe and J McKechnie and K J Ballard and B Ahmed and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2020/02/hair2020chi.pdf},
doi = {10.1145/3334480.3382886},
isbn = {978-1-4503-6819-3/20/04},
year = {2020},
date = {2020-04-25},
booktitle = {Extended Abstracts of the 2020 CHI Conference on Human Factors in Computing Systems},
publisher = {ACM},
abstract = {We previously developed a tablet-based speech therapy game called Apraxia World to address barriers to treatment and increase child motivation during therapy. In this study, we examined pronunciation improvements, child engagement over time, and caregiver evaluation performance while using our game. We recruited ten children to play Apraxia World at home during two four-week treatment blocks, separated by a two-week break; nine of ten have completed the protocol at time of writing. In the treatment blocks, children’s utterances were evaluated either by caregivers or an automated pronunciation framework. Preliminary analysis suggests that children made significant therapy gains with Apraxia World, even though caregivers evaluated pronunciation leniently. We also collected a corpus of child speech for offline examination. We will conduct additional analysis once all participants complete the protocol.},
keywords = {Automatic Speech Recognition, Childhood apraxia of speech, Health, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2019
|
Ding, S.; Zhao, G; Liberatore, C.; Gutierrez-Osuna, R. Learning Structured Sparse Representations for Voice Conversion Journal Article In: IEEE/ACM Transactions on Audio, Speech and Language Processing, vol. 28, pp. 343-354, 2019. @article{shaojin-2019-taslp,
title = {Learning Structured Sparse Representations for Voice Conversion},
author = {S. Ding and G Zhao and C. Liberatore and R. Gutierrez-Osuna},
url = {https://ieeexplore.ieee.org/document/8910392
https://psi.engr.tamu.edu/wp-content/uploads/2020/04/shaojin2019taslp.pdf},
doi = {10.1109/TASLP.2019.2955289},
year = {2019},
date = {2019-11-15},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
volume = {28},
pages = {343-354},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Ding, S.; Liberatore, C.; Sonsaat, S.; Lučić, I.; Silpachai, A.; Zhao, G; Chukharev-Hudilainen, E.; Levis, J.; Gutierrez-Osuna, R. Golden speaker builder – An interactive tool for pronunciation training Journal Article In: Speech Communication, vol. 115, pp. 51-66, 2019. @article{shaojin-2019-speechcomm,
title = {Golden speaker builder – An interactive tool for pronunciation training},
author = {S. Ding and C. Liberatore and S. Sonsaat and I. Lučić and A. Silpachai and G Zhao and E. Chukharev-Hudilainen and J. Levis and R. Gutierrez-Osuna},
url = {https://www.sciencedirect.com/science/article/pii/S0167639319302675
https://psi.engr.tamu.edu/wp-content/uploads/2019/11/1-s2.0-S0167639319302675-main.pdf
},
year = {2019},
date = {2019-11-14},
journal = {Speech Communication},
volume = {115},
pages = {51-66},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Hair, A; Ballard, K J; Ahmed, B; Gutierrez-Osuna, R Evaluating Automatic Speech Recognition for Child Speech Therapy Applications Proceedings Article In: ACM SIGACCESS Conference on Computers and Accessibility, ACM 2019, ISBN: 978-1-4503-6676-2/19/10. @inproceedings{hair2019evaluating,
title = {Evaluating Automatic Speech Recognition for Child Speech Therapy Applications},
author = {A Hair and K J Ballard and B Ahmed and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2019/08/hair2019evaluating.pdf},
doi = {10.1145/3308561.3354606},
isbn = {978-1-4503-6676-2/19/10},
year = {2019},
date = {2019-10-28},
booktitle = {ACM SIGACCESS Conference on Computers and Accessibility},
organization = {ACM},
abstract = {Automatic speech recognition (ASR) technology can be a useful tool in mobile apps for child speech therapy, empowering children to complete their practice with limited caregiver supervision. However, little is known about the feasibility of performing ASR on mobile devices, particularly when training data is limited. In this study, we investigated the performance of two low-resource ASR systems on disordered speech from children. We compared the open-source PocketSphinx (PS) recognizer using adapted acoustic models and a custom template-matching (TM) recognizer. TM and the adapted models significantly outperform the default PS model. On average, maximum likelihood linear regression and maximum a posteriori adaptation increased PS accuracy from 59.4% to 63.8% and 80.0%, respectively, suggesting that the models successfully captured speaker-specific word production variations. TM reached a mean accuracy of 75.8%.},
keywords = {Automatic Speech Recognition, Childhood apraxia of speech, Health, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
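To illustrate the template-matching (TM) approach evaluated above, here is a minimal sketch that classifies an utterance by dynamic-time-warping (DTW) distance between MFCC sequences and per-word templates. The function names, single-speaker template store, and plain Euclidean frame costs are illustrative assumptions, not the paper's implementation.

import numpy as np
import librosa

def mfcc_of(wav_path, n_mfcc=13):
    # MFCC sequence for one utterance, shape (frames, n_mfcc)
    y, sr = librosa.load(wav_path, sr=16000)
    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).T

def dtw_distance(a, b):
    # classic dynamic program over Euclidean frame-to-frame costs
    D = np.full((len(a) + 1, len(b) + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = np.linalg.norm(a[i - 1] - b[j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[-1, -1] / (len(a) + len(b))  # length-normalized path cost

def recognize(wav_path, templates):
    # templates: dict mapping each word to a list of template MFCC sequences
    test = mfcc_of(wav_path)
    return min(templates,
               key=lambda w: min(dtw_distance(test, t) for t in templates[w]))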
|
Ding, S.; Gutierrez-Osuna, R. Group Latent Embedding for Vector Quantized Variational Autoencoder in Non-Parallel Voice Conversion Proceedings Article In: Proc. Interspeech, 2019. @inproceedings{shaojin2019-interspeech,
title = {Group Latent Embedding for Vector Quantized Variational Autoencoder in Non-Parallel Voice Conversion},
author = {S. Ding and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2019/06/ding2019interspeech.pdf},
doi = {10.21437/Interspeech.2019-1198},
year = {2019},
date = {2019-09-15},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Zhao, G.; Ding, S.; Gutierrez-Osuna, R. Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams Proceedings Article In: Proc. Interspeech, 2019. @inproceedings{guanlong2019-interspeech,
title = {Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams},
author = {G. Zhao and S. Ding and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2019/07/zhao2019interspeech.pdf},
year = {2019},
date = {2019-09-15},
urldate = {2019-09-15},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Deep learning, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Zhao, G; Gutierrez-Osuna, R Using Phonetic Posteriorgram Based Frame Pairing for Segmental Accent Conversion Journal Article In: IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 27, no. 10, pp. 1649-1660, 2019, ISSN: 2329-9290. @article{zhao-2019-taslp,
title = {Using Phonetic Posteriorgram Based Frame Pairing for Segmental Accent Conversion},
author = {G Zhao and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2020/04/zhao2019taslp.pdf},
doi = {10.1109/TASLP.2019.2926754},
issn = {2329-9290},
year = {2019},
date = {2019-07-04},
urldate = {2019-07-04},
journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume = {27},
number = {10},
pages = {1649-1660},
abstract = {Accent conversion (AC) aims to transform non-native utterances to sound as if the speaker had a native accent. This can be achieved by mapping source speech spectra from a native speaker into the acoustic space of the target non-native speaker. In prior work, we proposed an AC approach that matches frames between the two speakers based on their acoustic similarity after compensating for differences in vocal tract length. In this paper, we propose a new approach that matches frames between the two speakers based on their phonetic (rather than acoustic) similarity. Namely, we map frames from the two speakers into a phonetic posteriorgram using speaker-independent acoustic models trained on native speech. We thoroughly evaluate the approach on a speech corpus containing multiple native and non-native speakers. The proposed algorithm outperforms the prior approach, improving ratings of acoustic quality (22% increase in mean opinion score) and native accent (69% preference) while retaining the voice quality of the non-native speaker. Furthermore, we show that the approach can be used in the reverse conversion direction, i.e., generating speech with a native speaker's voice quality and a non-native accent. Finally, we show that this approach can be applied to non-parallel training data, achieving the same accent conversion performance.},
keywords = {Accent conversion, Deep learning, Speech},
pubstate = {published},
tppubtype = {article}
}
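The key step in this abstract, pairing frames by phonetic rather than acoustic similarity, can be sketched in a few lines: each native frame is matched to the non-native frame whose phonetic posteriorgram (PPG) is closest. Symmetric KL divergence is one plausible frame distance; the paper's exact distance and any alignment constraints are not assumed here.

import numpy as np

def sym_kl(p, q, eps=1e-10):
    # symmetric KL divergence between two phoneme posterior vectors
    p, q = p + eps, q + eps
    return float(np.sum(p * np.log(p / q)) + np.sum(q * np.log(q / p)))

def pair_frames(native_ppg, nonnative_ppg):
    # PPGs: (frames, n_phones) arrays from a speaker-independent acoustic model
    pairs = []
    for i, p in enumerate(native_ppg):
        j = min(range(len(nonnative_ppg)),
                key=lambda k: sym_kl(p, nonnative_ppg[k]))
        pairs.append((i, j))
    return pairs  # (native, non-native) frame pairs for the spectral mapping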
|
Monteiro, C. D. D.; Shipman III, F. M.; Duggina, S.; Gutierrez-Osuna, R. Tradeoffs in the Efficient Detection of Sign Language Content in Video Sharing Sites Journal Article In: ACM Transactions on Accessible Computing, vol. 12, no. 2, pp. 1-16, 2019. @article{caio2019asl,
title = {Tradeoffs in the Efficient Detection of Sign Language Content in Video Sharing Sites},
author = {C. D. D. Monteiro and F. M. Shipman III and S. Duggina and R. Gutierrez-Osuna},
url = {https://dl.acm.org/doi/10.1145/3325863
https://psi.engr.tamu.edu/wp-content/uploads/2020/04/caio2019asl.pdf},
year = {2019},
date = {2019-06-01},
journal = {ACM Transactions on Accessible Computing},
volume = {12},
number = {2},
pages = {1-16},
keywords = {Computer vision, Speech},
pubstate = {published},
tppubtype = {article}
}
|
2018
|
Ahmed, B; Monroe, P; Hair, A; Tan, C-T; Gutierrez-Osuna, R; Ballard, K J Speech-driven mobile games for speech therapy: User experiences and feasibility Journal Article In: International Journal of Speech-Language Pathology, vol. 20, no. 6, pp. 644-658, 2018. @article{ahmed2018ijslp,
title = {Speech-driven mobile games for speech therapy: User experiences and feasibility},
author = {B Ahmed and P Monroe and A Hair and C-T Tan and R Gutierrez-Osuna and K J Ballard},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2019/06/ahmed-2018-ijslp.pdf
https://doi.org/10.1080/17549507.2018.1513562},
year = {2018},
date = {2018-10-30},
journal = {International Journal of Speech-Language Pathology},
volume = {20},
number = {6},
pages = {644-658},
keywords = {Childhood apraxia of speech, Games, Health, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Levis, J.; Chukharev-Hudilainen, E.; Gutierrez-Osuna, R.; Lucic, I.; Silpachai, A.; Sonsaat, S. Golden Speaker: Learner Experience with Computer-assisted Pronunciation Practice Proceedings Article In: Proc. Pronunciation in Second Language Learning and Teaching Conference, 2018. @inproceedings{levis2018psslt,
title = {Golden Speaker: Learner Experience with Computer-assisted Pronunciation Practice},
author = {J. Levis and E. Chukharev-Hudilainen and R. Gutierrez-Osuna and I. Lucic and A. Silpachai and S. Sonsaat},
year = {2018},
date = {2018-09-06},
booktitle = {Proc. Pronunciation in Second Language Learning and Teaching Conference},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Ding, S.; Zhao, G; Liberatore, C.; Gutierrez-Osuna, R. Improving Sparse Representations in Exemplar-Based Voice Conversion with a Phoneme-Selective Objective Function Proceedings Article In: Proc. Interspeech, 2018. @inproceedings{ding2018interspeech2,
title = {Improving Sparse Representations in Exemplar-Based Voice Conversion with a Phoneme-Selective Objective Function},
author = {S. Ding and G Zhao and C. Liberatore and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/09/ding2018interspeech2.pdf},
doi = {10.21437/Interspeech.2018-1272},
year = {2018},
date = {2018-09-02},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
McKechnie, J.; Ahmed, B.; Gutierrez-Osuna, R.; Monroe, P.; McCabe, P.; Ballard, K. J. Automated speech analysis tools for children’s speech production: A systematic literature review Journal Article In: International Journal of Speech-Language Pathology, vol. 20, no. 6, pp. 583–598, 2018. @article{mcKechnie2018review,
title = {Automated speech analysis tools for children’s speech production: A systematic literature review},
author = {J. McKechnie and B. Ahmed and R. Gutierrez-Osuna and P. Monroe and P. McCabe and K. J. Ballard},
url = {https://doi.org/10.1080/17549507.2018.1477991
https://psi.engr.tamu.edu/wp-content/uploads/2019/06/mckechnie-2018-ijslp.pdf},
year = {2018},
date = {2018-09-02},
journal = {International Journal of Speech-Language Pathology},
volume = {20},
number = {6},
pages = {583–598},
keywords = {Childhood apraxia of speech, Health, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Zhao, G; Sonsaat, S; Silpachai, A; Lucic, I; Chukharev-Hudilainen, E; Levis, J; Gutierrez-Osuna, R L2-ARCTIC: A Non-Native English Speech Corpus Proceedings Article In: Proc. Interspeech, 2018. @inproceedings{zhao2018interspeech,
title = {L2-ARCTIC: A Non-Native English Speech Corpus},
author = {G Zhao and S Sonsaat and A Silpachai and I Lucic and E Chukharev-Hudilainen and J Levis and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/08/zhao2018interspeech.pdf
https://psi.engr.tamu.edu/l2-arctic-corpus/},
year = {2018},
date = {2018-09-02},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Ding, S.; Liberatore, C.; Gutierrez-Osuna, R. Learning Structured Dictionaries for Exemplar-based Voice Conversion Proceedings Article In: Proc. Interspeech, 2018. @inproceedings{ding2018interspeech1,
title = {Learning Structured Dictionaries for Exemplar-based Voice Conversion},
author = {S. Ding and C. Liberatore and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/09/ding2018interspeech1.pdf},
doi = {10.21437/Interspeech.2018-1295},
year = {2018},
date = {2018-09-02},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Hair, A; Monroe, P; Ahmed, B; Ballard, K J; Gutierrez-Osuna, R Apraxia World: A Speech Therapy Game for Children with Speech Sound Disorders Proceedings Article In: Proceedings of the 2018 Conference on Interaction Design and Children, ACM, 2018, ISBN: 978-1-4503-5152-2/18/06. @inproceedings{hair2018idc,
title = {Apraxia World: A Speech Therapy Game for Children with Speech Sound Disorders},
author = {A Hair and P Monroe and B Ahmed and K J Ballard and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/04/hair2018idc.pdf},
doi = {10.1145/3202185.3202733},
isbn = {978-1-4503-5152-2/18/06},
year = {2018},
date = {2018-06-19},
booktitle = {Proceedings of the 2018 Conference on Interaction Design and Children},
publisher = {ACM},
abstract = {This paper presents Apraxia World, a remote therapy tool for speech sound disorders that integrates speech exercises into an engaging platformer-style game. In Apraxia World, the player controls the avatar with virtual buttons/joystick, whereas speech input is associated with assets needed to advance from one level to the next. We tested performance and child preference of two strategies for delivering speech exercises: during each level, and after it. Most children indicated that doing exercises after completing each level was less disruptive and preferable to doing exercises scattered through the level. We also found that children liked having perceived control over the game (character appearance, exercise behavior). Our results indicate that (i) a familiar style of game successfully engages children, (ii) speech exercises function well when decoupled from game control, and (iii) children are willing to complete required speech exercises while playing a game they enjoy.},
keywords = {Childhood apraxia of speech, Health, Mobile computing, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Liberatore, C; Zhao, G; Gutierrez-Osuna, R Voice Conversion through Residual Warping in a Sparse, Anchor-Based Representation of Speech Proceedings Article In: Proc. ICASSP, 2018. @inproceedings{liberatore2018icassp,
title = {Voice Conversion through Residual Warping in a Sparse, Anchor-Based Representation of Speech},
author = {C Liberatore and G Zhao and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/03/liberatore-icassp2018.pdf},
year = {2018},
date = {2018-04-15},
booktitle = {Proc. ICASSP},
abstract = {In previous work we presented a Sparse, Anchor-Based Representation of speech (SABR) that uses phonemic “anchors” to represent an utterance with a set of sparse non-negative weights. SABR is speaker-independent: combining weights from a source speaker with anchors from a target speaker can be used for voice conversion. Here, we present an extension of the original SABR that significantly improves voice conversion synthesis. Namely, we take the residual signal from the SABR decomposition of the source speaker’s utterance, and warp it to the target speaker’s space using a weighted warping function learned from pairs of source-target anchors. Using subjective and objective evaluations, we examine the performance of adding the warped residual (SABR+Res) to the original synthesis (SABR). Specifically, listeners rated SABR+Res with an average mean opinion score (MOS) of 3.6, a significant improvement compared to 2.2 MOS for SABR alone (p < 0.01) and 2.5 MOS for a baseline GMM method (p < 0.01). In an XAB speaker identity test, listeners correctly identified the identity of SABR+Res (81%) and SABR (84%) as frequently as a GMM method (82%) (p = 0.70, p = 0.35). These results indicate that adding the warped residual can dramatically improve synthesis while retaining the desirable independent qualities of SABR models.},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
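To make the SABR decomposition concrete: each spectral frame is approximated as a non-negative combination of per-phoneme anchor vectors, and conversion reuses the source speaker's weights with the target speaker's anchors. Below is a minimal sketch using a plain non-negative least-squares solve; the paper's actual objective adds sparsity constraints, and anchor construction is assumed given.

import numpy as np
from scipy.optimize import nnls

def sabr_weights(frame, anchors):
    # frame: (d,) spectral vector; anchors: (d, K) matrix whose columns are
    # per-phoneme anchor spectra for one speaker; returns non-negative weights
    w, _ = nnls(anchors, frame)
    return w

def convert_frame(src_frame, src_anchors, tgt_anchors):
    w = sabr_weights(src_frame, src_anchors)   # speaker-independent weights
    converted = tgt_anchors @ w                # re-synthesis in target space
    residual = src_frame - src_anchors @ w     # SABR+Res warps this residual
    return converted, residual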
|
Zhao, G; Sonsaat, S; Levis, J; Chukharev-Hudilainen, E; Gutierrez-Osuna, R Accent conversion using phonetic posteriorgrams Proceedings Article In: Proc. ICASSP, 2018. @inproceedings{zhao2018icassp,
title = {Accent conversion using phonetic posteriorgrams},
author = {G Zhao and S Sonsaat and J Levis and E Chukharev-Hudilainen and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/03/zhao2018icassp.pdf
http://people.tamu.edu/~guanlong.zhao/icassp18_demo.html
https://psi.engr.tamu.edu/l2-arctic-corpus/
https://github.com/guanlongzhao/ppg-gmm},
year = {2018},
date = {2018-04-15},
urldate = {2018-04-15},
booktitle = {Proc. ICASSP},
keywords = {Accent conversion, Deep learning, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Angello, G.; Zhao, G.; Manam, A. B.; Gutierrez-Osuna, R. Training Behavior of Successful Tacton-Phoneme Learners Proceedings Article In: IEEE Haptics Symposium, 2018. @inproceedings{genna2018tactons,
title = {Training Behavior of Successful Tacton-Phoneme Learners},
author = {G. Angello and G. Zhao and A. B. Manam and R. Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2022/01/AngelloHaptics2018-2.pdf},
year = {2018},
date = {2018-03-25},
booktitle = {IEEE Haptics Symposium},
keywords = {Other, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2017
|
Shipman, F; Duggina, S; Monteiro, C; Gutierrez-Osuna, R Speed-Accuracy Tradeoffs for Detecting Sign Language Content in Video Sharing Sites Proceedings Article In: Proceedings of ACM SIGACCESS Conference on Computers and Accessibility (ASSETS 2017), pp. 185-189, 2017. @inproceedings{shipman2017assets,
title = {Speed-Accuracy Tradeoffs for Detecting Sign Language Content in Video Sharing Sites},
author = {F Shipman and S Duggina and C Monteiro and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/shipman2017assets.pdf},
year = {2017},
date = {2017-11-21},
booktitle = {Proceedings of ACM SIGACCESS Conference on Computers and Accessibility (ASSETS 2017)},
pages = {185-189},
keywords = {Computer vision, Gestures, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Zhao, G; Gutierrez-Osuna, R Exemplar selection methods in voice conversion Proceedings Article In: Proc. 42nd International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 5525-5529, 2017. @inproceedings{zhao-2017-icassp,
title = {Exemplar selection methods in voice conversion},
author = {G Zhao and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/zhao2017icassp.pdf
http://people.tamu.edu/~guanlong.zhao/spring17/icassp17},
doi = {10.1109/ICASSP.2017.7953213},
year = {2017},
date = {2017-03-05},
booktitle = {Proc. 42nd International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {5525-5529},
keywords = {Speech, Voice conversion},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2016
|
Aryal, S; Gutierrez-Osuna, R Comparing Articulatory and Acoustic Strategies for Reducing Non-Native Accents Proceedings Article In: Proc. Interspeech, 2016. @inproceedings{aryal-2016-interspeech,
title = {Comparing Articulatory and Acoustic Strategies for Reducing Non-Native Accents},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2016interspeech.pdf},
year = {2016},
date = {2016-09-08},
booktitle = {Proc. Interspeech},
keywords = {Accent conversion, Articulatory synthesis, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Liberatore, C; Gutierrez-Osuna, R Generating Gestural Scores from Acoustics Through a Sparse Anchor-Based Representation of Speech Proceedings Article In: Proc. Interspeech, 2016. @inproceedings{liberatore2016interspeech,
title = {Generating Gestural Scores from Acoustics Through a Sparse Anchor-Based Representation of Speech},
author = {C Liberatore and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/liberatore2016interspeech.pdf},
year = {2016},
date = {2016-09-08},
booktitle = {Proc. Interspeech},
keywords = {Articulatory synthesis, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Shahin, M; Gutierrez-Osuna, R; Ahmed, B Classification of bisyllabic lexical stress patterns in disordered speech using deep learning Proceedings Article In: Proc. International Conference on Acoustics, Speech, and Signal Processing, 2016. @inproceedings{shahin2016icassp,
title = {Classification of bisyllabic lexical stress patterns in disordered speech using deep learning},
author = {M Shahin and R Gutierrez-Osuna and B Ahmed},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/shahin2016icassp.pdf},
year = {2016},
date = {2016-03-20},
booktitle = {Proc. International Conference on Acoustics, Speech, and Signal Processing},
keywords = {Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
McKechnie, J; Ballard, K J; McCabe, P; Murray, E; Lan, T; Gutierrez-Osuna, R; Ahmed, B Influence of type of feedback on effect of tablet-based delivery of intensive speech therapy in children with Childhood Apraxia of Speech Proceedings Article In: Proceedings of the Motor Speech Conference, 2016. @inproceedings{mckechnie-2016-motorspeech,
title = {Influence of type of feedback on effect of tablet-based delivery of intensive speech therapy in children with Childhood Apraxia of Speech},
author = {J McKechnie and K J Ballard and P McCabe and E Murray and T Lan and R Gutierrez-Osuna and B Ahmed},
year = {2016},
date = {2016-03-03},
booktitle = {Proceedings of the Motor Speech Conference},
journal = {Motor Speech Conference},
keywords = {Childhood apraxia of speech, Games, Health, Mobile computing, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Aryal, S; Gutierrez-Osuna, R Data driven articulatory synthesis with deep neural networks Journal Article In: Computer Speech and Language, vol. 36, pp. 260-273, 2016. @article{aryal-2015-cls,
title = {Data driven articulatory synthesis with deep neural networks},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2016csl.pdf},
year = {2016},
date = {2016-03-01},
urldate = {2016-03-01},
journal = {Computer Speech and Language},
volume = {36},
pages = {260-273},
keywords = {Accent conversion, Articulatory synthesis, Deep learning, Speech},
pubstate = {published},
tppubtype = {article}
}
|
2015
|
Parnandi, A; Karappa, V; Lan, T; Shahin, M; McKechnie, J; Ballard, K; Ahmed, B; Gutierrez-Osuna, R Development of a remote therapy tool for childhood apraxia of speech Journal Article In: ACM Transactions on Accessible Computing, vol. 7, no. 3, pp. 10:1-10:23, 2015. @article{parnandi2015taccess,
title = {Development of a remote therapy tool for childhood apraxia of speech},
author = {A Parnandi and V Karappa and T Lan and M Shahin and J McKechnie and K Ballard and B Ahmed and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/parnandi2015taccess.pdf},
year = {2015},
date = {2015-11-01},
journal = {ACM Transactions on Accessible Computing},
volume = {7},
number = {3},
pages = {10:1-10:23},
keywords = {Childhood apraxia of speech, Games, Health, Mobile computing, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Aryal, S; Gutierrez-Osuna, R Articulatory-based conversion of foreign accents with deep neural networks Proceedings Article In: Proc. Interspeech, pp. 3385-3389, 2015. @inproceedings{aryal2015interspeech,
title = {Articulatory-based conversion of foreign accents with deep neural networks},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2015interspeech.pdf},
year = {2015},
date = {2015-09-06},
urldate = {2015-09-06},
booktitle = {Proc. Interspeech},
pages = {3385-3389},
keywords = {Accent conversion, Articulatory synthesis, Deep learning, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Shahin, M; Ahmed, B; Parnandi, A; Karappa, V; McKechnie, J; Ballard, K; Gutierrez-Osuna, R Tabby Talks: an automated tool for the assessment of childhood apraxia of speech Journal Article In: Speech Communication, vol. in press, 2015. @article{shahin2015specom,
title = {Tabby Talks: an automated tool for the assessment of childhood apraxia of speech},
author = {M Shahin and B Ahmed and A Parnandi and V Karappa and J McKechnie and K Ballard and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/shahin2015specom.pdf},
year = {2015},
date = {2015-04-02},
urldate = {2015-04-02},
journal = {Speech Communication},
volume = {in press},
keywords = {Childhood apraxia of speech, Games, Health, Mobile computing, Speech},
pubstate = {published},
tppubtype = {article}
}
|
Aryal, S; Gutierrez-Osuna, R Reduction of non-native accents through statistical parametric articulatory synthesis Journal Article In: Journal of the Acoustical Society of America, vol. 137, no. 1, pp. 433-446, 2015. @article{aryal2015jasa,
title = {Reduction of non-native accents through statistical parametric articulatory synthesis},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2015jasa.pdf},
year = {2015},
date = {2015-01-23},
journal = {Journal of the Acoustical Society of America},
volume = {137},
number = {1},
pages = {433-446},
keywords = {Accent conversion, Articulatory synthesis, Speech},
pubstate = {published},
tppubtype = {article}
}
|
2014
|
Lan, T; Aryal, S; Ahmed, B; Ballard, K; Gutierrez-Osuna, R Flappy Voice: An Interactive Game for Childhood Apraxia of Speech Therapy Proceedings Article In: Proc. CHI-PLAY, 2014. @inproceedings{lan2014chiplay,
title = {Flappy Voice: An Interactive Game for Childhood Apraxia of Speech Therapy},
author = {T Lan and S Aryal and B Ahmed and K Ballard and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/lan2014chiplay.pdf},
year = {2014},
date = {2014-10-19},
booktitle = {Proc. CHI-PLAY},
keywords = {Childhood apraxia of speech, Games, Health, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Shahin, M; Ahmed, B; McKechnie, J; Ballard, K; Gutierrez-Osuna, R A comparison of GMM-HMM and DNN-HMM based pronunciation verification techniques for use in the assessment of childhood apraxia of speech Proceedings Article In: Proc. Interspeech, 2014. @inproceedings{mostafa2014interspeech,
title = {A comparison of GMM-HMM and DNN-HMM based pronunciation verification techniques for use in the assessment of childhood apraxia of speech},
author = {M Shahin and B Ahmed and J McKechnie and K Ballard and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/mostafa2014interspeech.pdf},
year = {2014},
date = {2014-09-14},
booktitle = {Proc. Interspeech},
keywords = {Childhood apraxia of speech, Health, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
McKechnie, J; Ballard, K; McCabe, P; Gutierrez-Osuna, R; Karappa, V; Parnandi, A; Shahin, M; Murray, E; Ahmed, B Tablet-based delivery of intensive speech therapy in children with Childhood Apraxia of Speech - Pilot Phase Proceedings Article In: Speech Pathology Australia National Conference, 2014. @inproceedings{jacqui2013australiaSLPconference,
title = {Tablet-based delivery of intensive speech therapy in children with Childhood Apraxia of Speech - Pilot Phase},
author = {J McKechnie and K Ballard and P McCabe and R Gutierrez-Osuna and V Karappa and A Parnandi and M Shahin and E Murray and B Ahmed},
year = {2014},
date = {2014-05-14},
urldate = {2014-05-14},
booktitle = {Speech Pathology Australia National Conference},
keywords = {Games, Health, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Felps, D; Aryal, S; Gutierrez-Osuna, R Normalization of articulatory data through Procrustes transformations and analysis-by-synthesis Proceedings Article In: Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 3051-3055, 2014. @inproceedings{danielprocrustes2014icassp,
title = {Normalization of articulatory data through Procrustes transformations and analysis-by-synthesis},
author = {D Felps and S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/danielprocrustes2014icassp.pdf},
year = {2014},
date = {2014-05-09},
booktitle = {Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {3051-3055},
keywords = {Accent conversion, Articulatory synthesis, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Aryal, S; Gutierrez-Osuna, R Can voice conversion be used to reduce non-native accents Proceedings Article In: Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 7929-7933, 2014. @inproceedings{sandeshaccentconversion2014icassp,
title = {Can voice conversion be used to reduce non-native accents},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/sandeshaccentconversion2014icassp.pdf},
year = {2014},
date = {2014-05-09},
booktitle = {Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {7929-7933},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Aryal, S; Gutierrez-Osuna, R Accent conversion through cross-speaker articulatory synthesis Proceedings Article In: Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 7744-7748, 2014. @inproceedings{sandesh2014icassp,
title = {Accent conversion through cross-speaker articulatory synthesis},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/sandesh2014icassp.pdf},
year = {2014},
date = {2014-05-09},
booktitle = {Proc. 39th International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {7744-7748},
keywords = {Accent conversion, Articulatory synthesis, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2013
|
Parnandi, A; Karappa, V; Son, Y; Shahin, M; McKechnie, J; Ballard, K; Ahmed, B; Gutierrez-Osuna, R Architecture of an automated therapy tool for childhood apraxia of speech Conference The 15th International ACM SIGACCESS Conference on Computers and Accessibility (ASSETS), 2013. @conference{avinashassets2013,
title = {Architecture of an automated therapy tool for childhood apraxia of speech},
author = {A Parnandi and V Karappa and Y Son and M Shahin and J McKechnie and K Ballard and B Ahmed and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/avinashassets2013.pdf},
year = {2013},
date = {2013-10-21},
urldate = {2013-10-21},
booktitle = {The 15th International ACM SIGACCESS Conference on Computers and Accessibility (ASSETS)},
keywords = {Childhood apraxia of speech, Games, Health, Mobile computing, Speech},
pubstate = {published},
tppubtype = {conference}
}
|
Aryal, S; Felps, D; Gutierrez-Osuna, R Foreign Accent Conversion through Voice Morphing Proceedings Article In: Interspeech, pp. 3077-3081, 2013. @inproceedings{aryal2013interspeech,
title = {Foreign Accent Conversion through Voice Morphing},
author = {S Aryal and D Felps and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2013interspeech.pdf},
year = {2013},
date = {2013-08-25},
booktitle = {Interspeech},
pages = {3077-3081},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Aryal, S; Gutierrez-Osuna, R Articulatory inversion and synthesis: towards articulatory-based modification of speech Proceedings Article In: 38th International Conference on Acoustics, Speech, and Signal Processing (ICASSP), pp. 7952-7956, 2013. @inproceedings{aryal2013icassp,
title = {Articulatory inversion and synthesis: towards articulatory-based modification of speech},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2013icassp.pdf},
year = {2013},
date = {2013-02-28},
booktitle = {38th International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages = {7952-7956},
keywords = {Articulatory inversion, Articulatory synthesis, Speech},
pubstate = {published},
tppubtype = {inproceedings}
}
|
2012
|
Aryal, S; Gutierrez-Osuna, R Articulatory Inversion and Synthesis: Towards Articulatory-Based Modification of Speech Technical Report 2012. @techreport{aryal2012techreport,
title = {Articulatory Inversion and Synthesis: Towards Articulatory-Based Modification of Speech},
author = {S Aryal and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/aryal2012techreport.pdf},
year = {2012},
date = {2012-12-04},
abstract = {Certain speech modifications, such as changes in foreign/regional accents or articulatory styles, are performed more effectively in the articulatory domain than in the acoustic domain. Though measuring articulators is cumbersome, articulatory parameters may be estimated from acoustics through inversion. In this paper, we study the impact on synthesis quality when articulators predicted from acoustics are used in articulatory synthesis. For this purpose, we trained a GMM articulatory synthesizer and drove it with articulators predicted with an RBF-based inversion model. Using inverted instead of measured articulators degraded synthesis quality, as measured through Mel cepstral distortion and subjective tests. However, retraining the synthesizer with predicted articulators not only reversed the effect of errors introduced during inversion but also improved synthesis quality relative to using measured articulators. These results suggest that inverted articulators do not compromise synthesis quality, and open up the possibility of performing speech modification in the articulatory domain through inversion.},
keywords = {Articulatory inversion, Articulatory synthesis, Speech},
pubstate = {published},
tppubtype = {techreport}
}
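A compact sketch of the inversion-then-synthesis loop the report studies: a regressor predicts articulatory trajectories from acoustics, and those predictions (rather than measured EMA) are used to retrain the synthesizer. Kernel ridge regression with an RBF kernel stands in for the report's RBF network, and the data below are synthetic stand-ins.

import numpy as np
from sklearn.kernel_ridge import KernelRidge

rng = np.random.default_rng(0)
X_ac = rng.normal(size=(500, 13))                  # stand-in acoustic frames
Y_art = np.tanh(X_ac @ rng.normal(size=(13, 6)))   # stand-in EMA trajectories

# acoustic-to-articulatory inversion
inversion = KernelRidge(kernel="rbf", alpha=1e-2).fit(X_ac, Y_art)
Y_pred = inversion.predict(X_ac)

# the report's key move: retrain the (GMM) articulatory synthesizer on the
# *predicted* articulators so training matches what it will see at run time
# synthesizer.fit(Y_pred, X_ac)   # synthesizer itself omitted from this sketch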
|
Parnandi, A; Son, Y; Shahin, M; Ahmed, B; Gutierrez-Osuna, R Architecture of an Automated Therapy Tool for Childhood Apraxia of Speech Technical Report 2012. @techreport{parnandi2012techreport-2,
title = {Architecture of an Automated Therapy Tool for Childhood Apraxia of Speech},
author = {A Parnandi and Y Son and M Shahin and B Ahmed and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/parnandi2012techreport-2.pdf},
year = {2012},
date = {2012-08-21},
urldate = {2012-08-21},
abstract = {We present a multi-tier architecture for automating the administration of speech therapy to children suffering from apraxia of speech. This architecture follows a client-server model and facilitates task-oriented remote therapeutic training in home settings. The therapy regimen is remotely assigned to the child by a speech therapist based on a standardized protocol. We utilize tablet PCs to provide stimuli to the children and record their speech response. The speech data is then streamed to a back-end server running a specialized speech-processing module to identify errors and quantify the progress of the child. These automated results allow the therapist to closely monitor the performance of each child, provide relevant feedback, and adapt the training program as needed. Our proposed architecture can accommodate a variety of interaction modalities that can serve as a complement to traditional face-to-face speech practice. In this paper we describe the client-server architecture, the middleware tools upon which the system has been built, and the speech-processing tools for automatically scoring the patients’ speech.},
keywords = {Games, Health, Mobile computing, Speech},
pubstate = {published},
tppubtype = {techreport}
}
We present a multi-tier architecture for automating the administration of speech therapy to children suffering from apraxia of speech. This architecture follows a client-server model and facilitates task-oriented remote therapeutic training in home settings. The therapy regimen is remotely assigned to the child by a speech therapist based on a standardized protocol. We utilize tablet PCs to provide stimuli to the children and record their speech response. The speech data is then streamed to a back-end server running a specialized speech-processing module to identify errors and quantify the progress of the child. These automated results allow the therapist to closely monitor the performance of each child, provide relevant feedback, and adapt the training program as needed. Our proposed architecture can accommodate a variety of interaction modalities that can serve as a complement to traditional face-to-face speech practice. In this paper we describe the client-server architecture, the middleware tools upon which the system has been built, and the speech-processing tools for automatically scoring the patients’ speech. |
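The client-server pattern described above can be sketched in a few lines; the snippet below is illustrative only, and the host, port, and length-prefixed framing are assumptions rather than details of the deployed system.

# Sketch of the tablet-to-server streaming path (not the project's code).
import socket
import struct

HOST, PORT = "127.0.0.1", 9000  # hypothetical back-end scoring server

def send_response(wav_bytes: bytes) -> None:
    """Client side: stream one recorded speech response to the server."""
    with socket.create_connection((HOST, PORT)) as sock:
        sock.sendall(struct.pack("!I", len(wav_bytes)))  # length prefix
        sock.sendall(wav_bytes)

def serve_once() -> bytes:
    """Server side: receive one length-prefixed response, then hand the
    audio to the speech-processing module for scoring."""
    with socket.create_server((HOST, PORT)) as srv:
        conn, _ = srv.accept()
        with conn:
            (n,) = struct.unpack("!I", conn.recv(4))
            chunks, received = [], 0
            while received < n:
                chunk = conn.recv(min(65536, n - received))
                if not chunk:
                    break
                chunks.append(chunk)
                received += len(chunk)
    return b"".join(chunks)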
Felps, D; Geng, C; Gutierrez-Osuna, R Foreign accent conversion through concatenative synthesis in the articulatory domain Journal Article In: IEEE Transactions on Audio, Speech and Language Processing, 2012. @article{felps2012taslp,
title = {Foreign accent conversion through concatenative synthesis in the articulatory domain},
author = {D Felps and C Geng and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2012taslp.pdf},
year = {2012},
date = {2012-01-01},
journal = {IEEE Transactions on Audio, Speech and Language Processing},
abstract = {We propose a concatenative synthesis approach to the problem of foreign accent conversion. The approach consists of replacing the most accented portions of nonnative speech with alternative segments from a corpus of the speaker’s own speech based on their similarity to those from a reference native speaker. We propose and compare two approaches for selecting units, one based on acoustic similarity [e.g., mel frequency cepstral coefficients (MFCCs)] and a second one based on articulatory similarity, as measured through electromagnetic articulography (EMA). Our hypothesis is that articulatory features provide a better metric for linguistic similarity across speakers than acoustic features. To test this hypothesis, we recorded an articulatory-acoustic corpus from a native and a nonnative speaker, and evaluated the two speech representations (acoustic versus articulatory) through a series of perceptual experiments. Formal listening tests indicate that the approach can achieve a 20% reduction in perceived accent, but also reveal a strong coupling between accent and speaker identity. To address this issue, we disguised original and resynthesized utterances by altering their average pitch and normalizing vocal tract length. An additional listening experiment supports the hypothesis that articulatory features are less speaker dependent than acoustic features.},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {article}
}
We propose a concatenative synthesis approach to the problem of foreign accent conversion. The approach consists of replacing the most accented portions of nonnative speech with alternative segments from a corpus of the speaker’s own speech based on their similarity to those from a reference native speaker. We propose and compare two approaches for selecting units, one based on acoustic similarity [e.g., mel frequency cepstral coefficients (MFCCs)] and a second one based on articulatory similarity, as measured through electromagnetic articulography (EMA). Our hypothesis is that articulatory features provide a better metric for linguistic similarity across speakers than acoustic features. To test this hypothesis, we recorded an articulatory-acoustic corpus from a native and a nonnative speaker, and evaluated the two speech representations (acoustic versus articulatory) through a series of perceptual experiments. Formal listening tests indicate that the approach can achieve a 20% reduction in perceived accent, but also reveal a strong coupling between accent and speaker identity. To address this issue, we disguised original and resynthesized utterances by altering their average pitch and normalizing vocal tract length. An additional listening experiment supports the hypothesis that articulatory features are less speaker dependent than acoustic features. |
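The unit-selection step at the core of this approach can be illustrated with a small sketch: for each frame of the native reference, pick the closest unit from the learner's own corpus, using either acoustic or articulatory features. This is a simplification (segmentation, join costs, and smoothing are omitted), and all names and data are hypothetical.

# Sketch of similarity-based unit selection. The same function serves
# both representations compared in the paper: pass MFCC matrices for
# acoustic similarity or EMA matrices for articulatory similarity.
import numpy as np

def select_units(native_targets, candidate_units):
    """Return, for each native target frame, the index of the closest
    unit in the learner's own corpus (Euclidean distance)."""
    dists = np.linalg.norm(
        native_targets[:, None, :] - candidate_units[None, :, :], axis=-1)
    return dists.argmin(axis=1)

rng = np.random.default_rng(1)
native_mfcc = rng.standard_normal((20, 13))     # hypothetical native frames
learner_units = rng.standard_normal((200, 13))  # learner's own units
print(select_units(native_mfcc, learner_units)[:5])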
2010
|
Felps, D; Gutierrez-Osuna, R Normalization of Articulatory Data through Procrustes Transformations and Analysis-by-synthesis Technical Report 2010. @techreport{felps2010techreport,
title = {Normalization of Articulatory Data through Procrustes Transformations and Analysis-by-synthesis},
author = {D Felps and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2010techreport.pdf},
year = {2010},
date = {2010-05-05},
abstract = {We describe and compare three methods that can be used to normalize articulatory data across speakers. The methods seek to explain systematic anatomical differences between a source and target speaker without modifying the articulatory velocities of the source speaker. The first method is the classical Procrustes transform, which allows for a global translation, rotation, and scaling of articulator positions. An extension to the Procrustes transform is presented that allows independent translations of each articulator. The additional parameters provide a 35% increase in articulatory similarity between two speakers when compared to classical Procrustes. The proposed extension is also coupled with a data-driven articulatory synthesizer to select model parameters that best explain the predicted acoustic (rather than articulatory) differences.},
keywords = {Articulatory synthesis, Speech},
pubstate = {published},
tppubtype = {techreport}
}
We describe and compare three methods that can be used to normalize articulatory data across speakers. The methods seek to explain systematic anatomical differences between a source and target speaker without modifying the articulatory velocities of the source speaker. The first method is the classical Procrustes transform, which allows for a global translation, rotation, and scaling of articulator positions. An extension to the Procrustes transform is presented that allows independent translations of each articulator. The additional parameters provide a 35% increase in articulatory similarity between two speakers when compared to classical Procrustes. The proposed extension is also coupled with a data-driven articulatory synthesizer to select model parameters that best explain the predicted acoustic (rather than articulatory) differences. |
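Both normalizations compared above are easy to prototype. Below is a minimal sketch using SciPy's classical Procrustes routine, with the per-articulator-translation extension approximated by matching each articulator's centroid before the global fit; the data, shapes, and offsets are synthetic assumptions.

# Sketch: classical Procrustes vs. adding independent per-articulator
# translations (an approximation of the extension described above).
import numpy as np
from scipy.spatial import procrustes

rng = np.random.default_rng(2)
source = rng.standard_normal((100, 8, 2))   # frames x pellets x 2-D
R = np.array([[0.9, -0.2], [0.2, 0.9]])     # global rotation/scale
offsets = rng.standard_normal((1, 8, 2))    # anatomical per-pellet shifts
target = source @ R + offsets

def classical(src, tgt):
    _, _, disparity = procrustes(tgt.reshape(-1, 2), src.reshape(-1, 2))
    return disparity

def with_articulator_offsets(src, tgt):
    # Match each articulator's centroid, then do the global fit.
    src0 = src - src.mean(axis=0, keepdims=True) + tgt.mean(axis=0, keepdims=True)
    return classical(src0, tgt)

print("classical Procrustes disparity:", classical(source, target))
print("per-articulator translations:  ", with_articulator_offsets(source, target))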
Gutierrez-Osuna, R; Felps, D Foreign Accent Conversion through Voice Morphing Technical Report 2010. @techreport{gutierrez2010techreport,
title = {Foreign Accent Conversion through Voice Morphing},
author = {R Gutierrez-Osuna and D Felps},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/gutierrez2010techreport.pdf},
year = {2010},
date = {2010-05-05},
abstract = {We present a voice morphing strategy that can be used to generate a continuum of accent transformations between a foreign speaker and a native speaker. The approach performs a cepstral decomposition of speech into spectral slope and spectral detail. Accent conversions are then generated by combining the spectral slope of the foreign speaker with a morph of the spectral detail of the native speaker. Spectral morphing is achieved by representing the spectral detail through pulse density modulation and averaging pulses in a pair-wise fashion. The technique is evaluated on parallel recordings from two ARCTIC speakers using objective measures of acoustic quality, speaker identity and foreign accent that have recently been shown to correlate with perceptual results from listening tests.},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {techreport}
}
We present a voice morphing strategy that can be used to generate a continuum of accent transformations between a foreign speaker and a native speaker. The approach performs a cepstral decomposition of speech into spectral slope and spectral detail. Accent conversions are then generated by combining the spectral slope of the foreign speaker with a morph of the spectral detail of the native speaker. Spectral morphing is achieved by representing the spectral detail through pulse density modulation and averaging pulses in a pair-wise fashion. The technique is evaluated on parallel recordings from two ARCTIC speakers using objective measures of acoustic quality, speaker identity and foreign accent that have recently been shown to correlate with perceptual results from listening tests. |
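The slope/detail split lends itself to a compact sketch. In the snippet below, the first few cepstral coefficients stand in for spectral slope and the remainder for spectral detail; the paper's pulse-density-modulation morph is replaced by simple linear interpolation, and the cepstral order and data are assumptions.

# Sketch of the accent continuum: foreign slope + morphed native detail.
import numpy as np

N_SLOPE = 2  # low-order cepstral coefficients ~ spectral slope (assumed)

def accent_morph(cep_foreign, cep_native, alpha):
    """alpha in [0, 1] moves the spectral detail from fully foreign (0)
    to fully native (1) while keeping the foreign speaker's slope."""
    out = cep_foreign.copy()
    out[:, N_SLOPE:] = ((1 - alpha) * cep_foreign[:, N_SLOPE:]
                        + alpha * cep_native[:, N_SLOPE:])
    return out

rng = np.random.default_rng(3)
foreign = rng.standard_normal((50, 20))  # hypothetical cepstral frames
native = rng.standard_normal((50, 20))   # time-aligned native frames
halfway = accent_morph(foreign, native, alpha=0.5)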
Felps, D; Geng, C; Berger, M; Richmond, K; Gutierrez-Osuna, R Relying on critical articulators to estimate vocal tract spectra in an articulatory-acoustic database Conference Interspeech, 2010. @conference{felps2010interspeech,
title = {Relying on critical articulators to estimate vocal tract spectra in an articulatory-acoustic database},
author = {D Felps and C Geng and M Berger and K Richmond and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2010interspeech.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Interspeech},
abstract = {We present a new phone-dependent feature weighting scheme that can be used to map articulatory configurations (e.g. EMA) onto vocal tract spectra (e.g. MFCC) through table lookup. The approach consists of assigning feature weights according to a feature's ability to predict the acoustic distance between frames. Since an articulator's predictive accuracy is phone-dependent (e.g., lip location is a better predictor for bilabial sounds than for palatal sounds), a unique weight vector is found for each phone. Inspection of the weights reveals a correspondence with the expected critical articulators for many phones. The proposed method reduces overall cepstral error by 6% when compared to a uniform weighting scheme. Vowels show the greatest benefit, though improvements occur for 80% of the tested phones.},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {conference}
}
We present a new phone-dependent feature weighting scheme that can be used to map articulatory configurations (e.g. EMA) onto vocal tract spectra (e.g. MFCC) through table lookup. The approach consists of assigning feature weights according to a feature's ability to predict the acoustic distance between frames. Since an articulator's predictive accuracy is phone-dependent (e.g., lip location is a better predictor for bilabial sounds than for palatal sounds), a unique weight vector is found for each phone. Inspection of the weights reveals a correspondence with the expected critical articulators for many phones. The proposed method reduces overall cepstral error by 6% when compared to a uniform weighting scheme. Vowels show the greatest benefit, though improvements occur for 80% of the tested phones. |
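The weighted table lookup can be sketched as follows. The weight-learning step shown (correlating each feature's frame-pair distance with the acoustic distance) is a simplified stand-in for the paper's scheme, and all shapes and names are assumptions; in the paper, a separate weight vector would be learned per phone.

# Sketch of weighted lookup from EMA configurations to MFCC frames.
import numpy as np

def lookup(query_ema, table_ema, table_mfcc, weights):
    """Return the MFCC frame whose EMA entry is closest to the query
    under a weighted Euclidean distance."""
    d = (((table_ema - query_ema) ** 2) * weights).sum(axis=1)
    return table_mfcc[d.argmin()]

def learn_weights(ema, mfcc, n_pairs=2000, seed=0):
    """Weight each EMA dimension by how well its frame-pair distance
    predicts acoustic distance (a correlation-based simplification)."""
    rng = np.random.default_rng(seed)
    i = rng.integers(0, len(ema), n_pairs)
    j = rng.integers(0, len(ema), n_pairs)
    d_feat = (ema[i] - ema[j]) ** 2                     # per dimension
    d_acou = np.linalg.norm(mfcc[i] - mfcc[j], axis=1)  # scalar
    corr = np.array([np.corrcoef(d_feat[:, k], d_acou)[0, 1]
                     for k in range(ema.shape[1])])
    w = np.clip(corr, 0.0, None)
    return w / w.sum()

rng = np.random.default_rng(0)
ema = rng.standard_normal((500, 12))   # hypothetical articulatory table
mfcc = rng.standard_normal((500, 13))  # paired acoustic table
w = learn_weights(ema, mfcc)
nearest = lookup(ema[0], ema, mfcc, w)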
Felps, D; Gutierrez-Osuna, R Developing objective measures of foreign-accent conversion Journal Article In: IEEE Transactions on Audio, Speech, and Language Processing, vol. 18, no. 5, pp. 1030–1040, 2010. @article{felps2010talsp,
title = {Developing objective measures of foreign-accent conversion},
author = {D Felps and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2010talsp.pdf},
year = {2010},
date = {2010-01-01},
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
volume = {18},
number = {5},
pages = {1030--1040},
publisher = {IEEE},
abstract = {Various methods have recently appeared to transform foreign-accented speech into its native-accented counterpart. Evaluation of these accent conversion methods requires extensive listening tests across a number of perceptual dimensions. This article presents three objective measures that may be used to assess the acoustic quality, degree of foreign accent, and speaker identity of accent-converted utterances. Accent conversion generates novel utterances: those of a foreign speaker with a native accent. Therefore, the acoustic quality in accent conversion cannot be evaluated with conventional measures of spectral distortion, which assume that a clean recording of the speech signal is available for comparison. Here we evaluate a single-ended measure of speech quality, ITU-T Recommendation P.563 for narrow-band telephony. We also propose a measure of foreign accent that exploits a weakness of automatic speech recognizers: their sensitivity to foreign accents. Namely, we use phoneme-level match scores given by the HTK recognizer trained on a large number of American English speakers to obtain a measure of native accent. Finally, we propose a measure of speaker identity that projects acoustic vectors (e.g., Mel cepstral, F0) onto the linear discriminant that maximizes separability for a given pair of source and target speakers. The three measures are evaluated on a corpus of accent-converted utterances that had been previously rated through perceptual tests. Our results show that the three measures have a high degree of correlation with their corresponding subjective ratings, suggesting that they may be used to accelerate the development of foreign-accent conversion tools. Applications of these measures in the context of computer assisted pronunciation training and voice conversion are also discussed.},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {article}
}
Various methods have recently appeared to transform foreign-accented speech into its native-accented counterpart. Evaluation of these accent conversion methods requires extensive listening tests across a number of perceptual dimensions. This article presents three objective measures that may be used to assess the acoustic quality, degree of foreign accent, and speaker identity of accent-converted utterances. Accent conversion generates novel utterances: those of a foreign speaker with a native accent. Therefore, the acoustic quality in accent conversion cannot be evaluated with conventional measures of spectral distortion, which assume that a clean recording of the speech signal is available for comparison. Here we evaluate a single-ended measure of speech quality, ITU-T Recommendation P.563 for narrow-band telephony. We also propose a measure of foreign accent that exploits a weakness of automatic speech recognizers: their sensitivity to foreign accents. Namely, we use phoneme-level match scores given by the HTK recognizer trained on a large number of American English speakers to obtain a measure of native accent. Finally, we propose a measure of speaker identity that projects acoustic vectors (e.g., Mel cepstral, F0) onto the linear discriminant that maximizes separability for a given pair of source and target speakers. The three measures are evaluated on a corpus of accent-converted utterances that had been previously rated through perceptual tests. Our results show that the three measures have a high degree of correlation with their corresponding subjective ratings, suggesting that they may be used to accelerate the development of foreign-accent conversion tools. Applications of these measures in the context of computer assisted pronunciation training and voice conversion are also discussed. |
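The speaker-identity measure described above is essentially a two-class discriminant projection, which is straightforward to sketch with scikit-learn; the feature dimensionality and synthetic frames below are assumptions, not the paper's setup.

# Sketch of the LDA-based identity measure: converted frames are
# projected onto the discriminant separating source and target speakers.
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.default_rng(4)
src = rng.standard_normal((300, 14)) + 1.0  # hypothetical source frames
tgt = rng.standard_normal((300, 14)) - 1.0  # hypothetical target frames
converted = rng.standard_normal((100, 14))  # accent-converted frames

X = np.vstack([src, tgt])
y = np.array([0] * len(src) + [1] * len(tgt))
lda = LinearDiscriminantAnalysis(n_components=1).fit(X, y)

# Identity is preserved when converted frames project near the source.
print("source mean:   ", float(lda.transform(src).mean()))
print("target mean:   ", float(lda.transform(tgt).mean()))
print("converted mean:", float(lda.transform(converted).mean()))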
2009
|
Pazarloglou, A; Stoleru, R; Gutierrez-Osuna, R High-resolution speech signal reconstruction in wireless sensor networks Conference Consumer Communications and Networking Conference, IEEE 2009. @conference{pazarloglou2009high,
title = {High-resolution speech signal reconstruction in wireless sensor networks},
author = {A Pazarloglou and R Stoleru and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/pazarloglou2009high.pdf},
year = {2009},
date = {2009-01-01},
booktitle = {Consumer Communications and Networking Conference},
pages = {1--5},
organization = {IEEE},
abstract = {Data streaming is an emerging class of applications for sensor networks that has very high bandwidth and processing power requirements. In this paper, a new approach for speech data streaming is proposed, which is based on a distributed scheme. This scheme focuses on balancing the energy consumption among nodes in a sensor network by allowing low-resolution streams from multiple nodes to be fused at a central processing node in order to produce an enhanced resolution speech signal. Simulations and experimental results with real microphone signals are presented.},
keywords = {Speech},
pubstate = {published},
tppubtype = {conference}
}
Data streaming is an emerging class of applications for sensor networks that has very high bandwidth and processing power requirements. In this paper, a new approach for speech data streaming is proposed, which is based on a distributed scheme. This scheme focuses on balancing the energy consumption among nodes in a sensor network by allowing low-resolution streams from multiple nodes to be fused at a central processing node in order to produce an enhanced resolution speech signal. Simulations and experimental results with real microphone signals are presented. |
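The fusion idea can be illustrated numerically: several nodes contribute coarsely quantized copies of the same signal, and averaging them at the central node raises the effective resolution. The snippet below is a toy demonstration under stated assumptions (independent sensor noise large enough to act as dither), not the paper's scheme.

# Toy demonstration: averaging low-resolution streams improves SNR.
import numpy as np

def quantize(x, bits):
    half = 2 ** (bits - 1)
    return np.round(x * half) / half

def snr_db(ref, est):
    return 10 * np.log10(np.sum(ref ** 2) / np.sum((ref - est) ** 2))

rng = np.random.default_rng(5)
t = np.linspace(0, 1, 8000, endpoint=False)
clean = np.sin(2 * np.pi * 220 * t)  # stand-in for a speech-band signal

# Each node: independent sensor noise (which dithers the quantizer),
# then coarse 4-bit quantization before transmission.
streams = [quantize(clean + 0.1 * rng.standard_normal(t.shape), bits=4)
           for _ in range(8)]
fused = np.mean(streams, axis=0)  # fusion at the central processing node

print(f"single node SNR: {snr_db(clean, streams[0]):.1f} dB")
print(f"fused SNR:       {snr_db(clean, fused):.1f} dB")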
Felps, D; Bortfeld, H; Gutierrez-Osuna, R Foreign accent conversion in computer assisted pronunciation training Journal Article In: Speech Communication, vol. 51, no. 10, pp. 920–932, 2009. @article{felps2009foreign,
title = {Foreign accent conversion in computer assisted pronunciation training},
author = {D Felps and H Bortfeld and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps2009foreign.pdf},
year = {2009},
date = {2009-01-01},
journal = {Speech Communication},
volume = {51},
number = {10},
pages = {920--932},
publisher = {Elsevier},
abstract = {Learners of a second language practice their pronunciation by listening to and imitating utterances from native speakers. Recent research has shown that choosing a well-matched native speaker to imitate can have a positive impact on pronunciation training. Here we propose a voice-transformation technique that can be used to generate the (arguably) ideal voice to imitate: the learner’s own voice with a native accent. Our work extends previous research, which suggests that providing learners with prosodically corrected versions of their utterances can be a suitable form of feedback in computer assisted pronunciation training. Our technique provides a conversion of both prosodic and segmental characteristics by means of a pitch-synchronous decomposition of speech into glottal excitation and spectral envelope. We apply the technique to a corpus containing parallel recordings of foreign-accented and native-accented utterances, and validate the resulting accent conversions through a series of perceptual experiments. Our results indicate that the technique can reduce foreign accentedness without significantly altering the voice quality properties of the foreign speaker. Finally, we propose a pedagogical strategy for integrating accent conversion as a form of behavioral shaping in computer assisted pronunciation training.},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {article}
}
Learners of a second language practice their pronunciation by listening to and imitating utterances from native speakers. Recent research has shown that choosing a well-matched native speaker to imitate can have a positive impact on pronunciation training. Here we propose a voice-transformation technique that can be used to generate the (arguably) ideal voice to imitate: the learner’s own voice with a native accent. Our work extends previous research, which suggests that providing learners with prosodically corrected versions of their utterances can be a suitable form of feedback in computer assisted pronunciation training. Our technique provides a conversion of both prosodic and segmental characteristics by means of a pitch-synchronous decomposition of speech into glottal excitation and spectral envelope. We apply the technique to a corpus containing parallel recordings of foreign-accented and native-accented utterances, and validate the resulting accent conversions through a series of perceptual experiments. Our results indicate that the technique can reduce foreign accentedness without significantly altering the voice quality properties of the foreign speaker. Finally, we propose a pedagogical strategy for integrating accent conversion as a form of behavioral shaping in computer assisted pronunciation training. |
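A compact way to illustrate the segmental part of this transformation is a source-filter swap: keep the learner's excitation but impose the native speaker's spectral envelope. The sketch below uses frame-level LPC as a stand-in for the paper's pitch-synchronous decomposition; the frame contents, LPC order, and alignment are assumptions.

# Sketch: LPC source-filter swap on one pair of time-aligned frames.
import numpy as np
import librosa
from scipy.signal import lfilter

def swap_envelope(frame_learner, frame_native, order=16):
    """Inverse-filter the learner's frame to get its excitation, then
    refilter through the native speaker's all-pole envelope."""
    a_learner = librosa.lpc(frame_learner, order=order)
    a_native = librosa.lpc(frame_native, order=order)
    excitation = lfilter(a_learner, [1.0], frame_learner)  # A_learner(z) x
    return lfilter([1.0], a_native, excitation)            # 1 / A_native(z)

rng = np.random.default_rng(6)
learner = rng.standard_normal(1024)  # placeholder for a learner frame
native = rng.standard_normal(1024)   # placeholder aligned native frame
converted = swap_envelope(learner, native)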
2008
|
Felps, D; Bortfeld, H; Gutierrez-Osuna, R Prosodic and segmental factors in foreign-accent conversion Technical Report 2008. @techreport{felps08prosodic,
title = {Prosodic and segmental factors in foreign-accent conversion},
author = {D Felps and H Bortfeld and R Gutierrez-Osuna},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/felps08prosodic.pdf},
year = {2008},
date = {2008-07-11},
abstract = {We propose a signal processing method that transforms foreign-accented speech to resemble its native-accented counterpart. The problem is closely related to voice conversion, except that our method seeks to preserve the organic properties of the foreign speaker’s voice; i.e., only those features which cue foreign-accentedness are to be transformed. Our method operates at two levels: prosodic and segmental. Prosodic transformation is performed by means of time and pitch scaling. Segmental transformation is performed by convolving the foreign speaker’s excitation with the warped spectral envelope of the native speaker. Perceptual results indicate that our model is able to provide a 63% reduction in foreign-accentedness. Multidimensional scaling also shows that the segmental transformation causes the perception of a new speaker to emerge, though the identity of this new speaker is three times closer to the foreign speaker than to the native speaker.},
keywords = {Accent conversion, Speech},
pubstate = {published},
tppubtype = {techreport}
}
We propose a signal processing method that transforms foreign-accented speech to resemble its native-accented counterpart. The problem is closely related to voice conversion, except that our method seeks to preserve the organic properties of the foreign speaker’s voice; i.e., only those features which cue foreign-accentedness are to be transformed. Our method operates at two levels: prosodic and segmental. Prosodic transformation is performed by means of time and pitch scaling. Segmental transformation is performed by convolving the foreign speaker’s excitation with the warped spectral envelope of the native speaker. Perceptual results indicate that our model is able to provide a 63% reduction in foreign-accentedness. Multidimensional scaling also shows that the segmental transformation causes the perception of a new speaker to emerge, though the identity of this new speaker is three times closer to the foreign speaker than to the native speaker. |
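The prosodic stage (time and pitch scaling) can be approximated with off-the-shelf phase-vocoder tools. The snippet below uses librosa as a stand-in for the paper's own processing; the scaling factors and the synthetic input are illustrative assumptions, not measured values.

# Sketch of prosodic transformation: rate and pitch moved toward the
# native speaker's values (factors are made up for illustration).
import librosa

sr = 22050
y = librosa.chirp(fmin=110, fmax=220, sr=sr, duration=2.0)  # placeholder audio

rate_factor = 1.15  # e.g., speak 15% faster
pitch_steps = -1.0  # e.g., lower pitch by one semitone

y_timed = librosa.effects.time_stretch(y, rate=rate_factor)
y_prosody = librosa.effects.pitch_shift(y_timed, sr=sr, n_steps=pitch_steps)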
Choi, H; Gutierrez-Osuna, R; Choi, S; Choe, Y Kernel oriented discriminant analysis for speaker-independent phoneme spaces Conference International Conference on Pattern Recognition, IEEE 2008. @conference{choi2008kernel,
title = {Kernel oriented discriminant analysis for speaker-independent phoneme spaces},
author = {H Choi and R Gutierrez-Osuna and S Choi and Y Choe},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/choi2008kernel.pdf},
year = {2008},
date = {2008-01-01},
booktitle = {International Conference on Pattern Recognition},
pages = {1--4},
organization = {IEEE},
abstract = {Speaker-independent feature extraction is a critical problem in speech recognition. Oriented principal component analysis (OPCA) is a potential solution that can find a subspace robust against noise in the data set. The objective of this paper is to find a speaker-independent subspace by generalizing OPCA in two steps: First, we find a nonlinear subspace with the help of a kernel trick, which we refer to as kernel OPCA. Second, we generalize OPCA to problems with more than two phonemes, which leads to oriented discriminant analysis (ODA). In addition, we equip ODA with the kernel trick again, which we refer to as kernel ODA. The models are tested on the CMU ARCTIC speech database. Our results indicate that our proposed kernel methods can outperform linear OPCA and linear ODA at finding a speaker-independent phoneme space.},
keywords = {Speech},
pubstate = {published},
tppubtype = {conference}
}
Speaker-independent feature extraction is a critical problem in speech recognition. Oriented principal component analysis (OPCA) is a potential solution that can find a subspace robust against noise in the data set. The objective of this paper is to find a speaker-independent subspace by generalizing OPCA in two steps: First, we find a nonlinear subspace with the help of a kernel trick, which we refer to as kernel OPCA. Second, we generalize OPCA to problems with more than two phonemes, which leads to oriented discriminant analysis (ODA). In addition, we equip ODA with the kernel trick again, which we refer to as kernel ODA. The models are tested on the CMU ARCTIC speech database. Our results indicate that our proposed kernel methods can outperform linear OPCA and linear ODA at finding a speaker-independent phoneme space. |
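Linear OPCA reduces to a generalized eigenproblem: maximize the signal-to-noise Rayleigh quotient w^T S w / w^T N w over projection directions w. The kernel variants discussed above solve the same problem in a kernel-induced feature space; the sketch below shows only the linear case, on synthetic covariances, with names and dimensions assumed.

# Sketch of linear OPCA via SciPy's generalized eigensolver.
import numpy as np
from scipy.linalg import eigh

rng = np.random.default_rng(7)
signal = rng.standard_normal((400, 10))       # phoneme-relevant variation
noise = 0.5 * rng.standard_normal((400, 10))  # speaker/noise variation

S = np.cov(signal, rowvar=False)                     # signal covariance
N = np.cov(noise, rowvar=False) + 1e-6 * np.eye(10)  # noise covariance

# Generalized eigenvectors of (S, N) maximize w^T S w / w^T N w.
eigvals, eigvecs = eigh(S, N)
W = eigvecs[:, ::-1][:, :3]  # top-3 oriented components
projected = signal @ W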
2005
|
Gutierrez-Osuna, R; Kakumanu, P; Esposito, A; Garcia, ON; Bojorquez, A; Castillo, JL; Rudomin, I Speech-driven facial animation with realistic dynamics Journal Article In: IEEE Transactions on Multimedia, vol. 7, no. 1, pp. 33–42, 2005. @article{gutierrez2005tmm,
title = {Speech-driven facial animation with realistic dynamics},
author = {R Gutierrez-Osuna and P Kakumanu and A Esposito and ON Garcia and A Bojorquez and JL Castillo and I Rudomin},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/gutierrez2005tmm.pdf},
year = {2005},
date = {2005-01-01},
journal = {IEEE Transactions on Multimedia},
volume = {7},
number = {1},
pages = {33--42},
publisher = {IEEE},
abstract = {This work presents an integral system capable of generating animations with realistic dynamics, including the individualized nuances, of three-dimensional (3-D) human faces driven by speech acoustics. The system is capable of capturing short phenomena in the orofacial dynamics of a given speaker by tracking the 3-D location of various MPEG-4 facial points through stereovision. A perceptual transformation of the speech spectral envelope and prosodic cues are combined into an acoustic feature vector to predict 3-D orofacial dynamics by means of a nearest-neighbor algorithm. The Karhunen-Loève transformation is used to identify the principal components of orofacial motion, decoupling perceptually natural components from experimental noise. We also present a highly optimized MPEG-4 compliant player capable of generating audio-synchronized animations at 60 frames/s. The player is based on a pseudo-muscle model augmented with a nonpenetrable ellipsoidal structure to approximate the skull and the jaw. This structure adds a sense of volume that provides more realistic dynamics than existing simplified pseudo-muscle-based approaches, yet it is simple enough to work at the desired frame rate. Experimental results on an audiovisual database of compact TIMIT sentences are presented to illustrate the performance of the complete system.},
keywords = {Facial animation, Speech},
pubstate = {published},
tppubtype = {article}
}
This work presents an integral system capable of generating animations with realistic dynamics, including the individualized nuances, of three-dimensional (3-D) human faces driven by speech acoustics. The system is capable of capturing short phenomena in the orofacial dynamics of a given speaker by tracking the 3-D location of various MPEG-4 facial points through stereovision. A perceptual transformation of the speech spectral envelope and prosodic cues are combined into an acoustic feature vector to predict 3-D orofacial dynamics by means of a nearest-neighbor algorithm. The Karhunen-Loève transformation is used to identify the principal components of orofacial motion, decoupling perceptually natural components from experimental noise. We also present a highly optimized MPEG-4 compliant player capable of generating audio-synchronized animations at 60 frames/s. The player is based on a pseudo-muscle model augmented with a nonpenetrable ellipsoidal structure to approximate the skull and the jaw. This structure adds a sense of volume that provides more realistic dynamics than existing simplified pseudo-muscle-based approaches, yet it is simple enough to work at the desired frame rate. Experimental results on an audiovisual database of compact TIMIT sentences are presented to illustrate the performance of the complete system. |
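The Karhunen-Loève step above is, in practice, a PCA over the tracked facial-point trajectories: high-variance components capture coherent orofacial motion while low-variance dimensions absorb tracking noise. A minimal sketch follows, with synthetic data and an assumed variance threshold.

# Sketch: PCA over motion-capture trajectories to separate principal
# orofacial motion from experimental noise.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(8)
# Hypothetical frames x flattened 3-D facial-point coordinates,
# generated with low-rank structure plus tracking noise.
motion = rng.standard_normal((600, 3)) @ rng.standard_normal((3, 30))
motion += 0.05 * rng.standard_normal((600, 30))

pca = PCA(n_components=0.95)  # keep 95% of the motion variance
scores = pca.fit_transform(motion)
denoised = pca.inverse_transform(scores)
print("retained components:", pca.n_components_)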
2001
|
Kakumanu, P; Gutierrez-Osuna, R; Esposito, A; Bryll, R; Goshtasby, A; Garcia, ON Speech driven facial animation Conference Proceedings of the 2001 workshop on Perceptive user interfaces, ACM 2001. @conference{kakumanu2001speech,
title = {Speech driven facial animation},
author = {P Kakumanu and R Gutierrez-Osuna and A Esposito and R Bryll and A Goshtasby and ON Garcia},
url = {https://psi.engr.tamu.edu/wp-content/uploads/2018/01/kakumanu2001speech.pdf},
year = {2001},
date = {2001-01-01},
booktitle = {Proceedings of the 2001 workshop on Perceptive user interfaces},
pages = {1--5},
organization = {ACM},
abstract = {The results reported in this article are an integral part of a larger project aimed at achieving perceptually realistic animations, including the individualized nuances, of three-dimensional human faces driven by speech. The audiovisual system that has been developed for learning the spatio-temporal relationship between speech acoustics and facial animation is described, including video and speech processing, pattern analysis, and MPEG-4 compliant facial animation for a given speaker. In particular, we propose a perceptual transformation of the speech spectral envelope, which is shown to capture the dynamics of articulatory movements. An efficient nearest-neighbor algorithm is used to predict novel articulatory trajectories from the speech dynamics. The results are very promising and suggest a new way to approach the modeling of synthetic lip motion of a given speaker driven by his/her speech. This would also provide clues toward a more general cross-speaker realistic animation.},
keywords = {Facial animation, Speech},
pubstate = {published},
tppubtype = {conference}
}
The results reported in this article are an integral part of a larger project aimed at achieving perceptually realistic animations, including the individualized nuances, of three-dimensional human faces driven by speech. The audiovisual system that has been developed for learning the spatio-temporal relationship between speech acoustics and facial animation is described, including video and speech processing, pattern analysis, and MPEG-4 compliant facial animation for a given speaker. In particular, we propose a perceptual transformation of the speech spectral envelope, which is shown to capture the dynamics of articulatory movements. An efficient nearest-neighbor algorithm is used to predict novel articulatory trajectories from the speech dynamics. The results are very promising and suggest a new way to approach the modeling of synthetic lip motion of a given speaker driven by his/her speech. This would also provide clues toward a more general cross-speaker realistic animation. |
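The nearest-neighbor mapping at the heart of this approach fits in a few lines with scikit-learn; feature extraction, temporal smoothing, and the MPEG-4 player are outside the snippet, and all shapes and data below are assumptions.

# Sketch: k-NN regression from acoustic feature frames to facial
# animation parameters, then prediction on novel speech.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(9)
acoustic = rng.standard_normal((1000, 16))  # hypothetical feature frames
fap = rng.standard_normal((1000, 20))       # aligned animation parameters

knn = KNeighborsRegressor(n_neighbors=5).fit(acoustic, fap)
novel = rng.standard_normal((30, 16))       # frames from new speech
trajectory = knn.predict(novel)             # drives the face model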