@inproceedings{szekely-etal-2012-evaluating,
title = "Evaluating expressive speech synthesis from audiobook corpora for conversational phrases",
author = "Sz{\'e}kely, {\'E}va and
Cabral, Joao Paulo and
Abou-Zleikha, Mohamed and
Cahill, Peter and
Carson-Berndsen, Julie",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Do{\u{g}}an, Mehmet U{\u{g}}ur and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/864_Paper.pdf",
pages = "3335--3339",
abstract = "Audiobooks are a rich resource of large quantities of natural sounding, highly expressive speech. In our previous research we have shown that it is possible to detect different expressive voice styles represented in a particular audiobook, using unsupervised clustering to group the speech corpus of the audiobook into smaller subsets representing the detected voice styles. These subsets of corpora of different voice styles reflect the various ways a speaker uses their voice to express involvement and affect, or imitate characters. This study is an evaluation of the detection of voice styles in an audiobook in the application of expressive speech synthesis. A further aim of this study is to investigate the usability of audiobooks as a language resource for expressive speech synthesis of utterances of conversational speech. Two evaluations have been carried out to assess the effect of the genre transfer: transmitting expressive speech from read aloud literature to conversational phrases with the application of speech synthesis. The first evaluation revealed that listeners have different voice style preferences for a particular conversational phrase. The second evaluation showed that it is possible for users of speech synthesis systems to learn the characteristics of a voice style well enough to make reliable predictions about what a certain utterance will sound like when synthesised using that voice style.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="szekely-etal-2012-evaluating">
<titleInfo>
<title>Evaluating expressive speech synthesis from audiobook corpora for conversational phrases</title>
</titleInfo>
<name type="personal">
<namePart type="given">Éva</namePart>
<namePart type="family">Székely</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joao</namePart>
<namePart type="given">Paulo</namePart>
<namePart type="family">Cabral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Abou-Zleikha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Cahill</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julie</namePart>
<namePart type="family">Carson-Berndsen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehmet</namePart>
<namePart type="given">Uğur</namePart>
<namePart type="family">Doğan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Istanbul, Turkey</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Audiobooks are a rich resource of large quantities of natural sounding, highly expressive speech. In our previous research we have shown that it is possible to detect different expressive voice styles represented in a particular audiobook, using unsupervised clustering to group the speech corpus of the audiobook into smaller subsets representing the detected voice styles. These subsets of corpora of different voice styles reflect the various ways a speaker uses their voice to express involvement and affect, or imitate characters. This study is an evaluation of the detection of voice styles in an audiobook in the application of expressive speech synthesis. A further aim of this study is to investigate the usability of audiobooks as a language resource for expressive speech synthesis of utterances of conversational speech. Two evaluations have been carried out to assess the effect of the genre transfer: transmitting expressive speech from read aloud literature to conversational phrases with the application of speech synthesis. The first evaluation revealed that listeners have different voice style preferences for a particular conversational phrase. The second evaluation showed that it is possible for users of speech synthesis systems to learn the characteristics of a voice style well enough to make reliable predictions about what a certain utterance will sound like when synthesised using that voice style.</abstract>
<identifier type="citekey">szekely-etal-2012-evaluating</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/864_Paper.pdf</url>
</location>
<part>
<date>2012-05</date>
<extent unit="page">
<start>3335</start>
<end>3339</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating expressive speech synthesis from audiobook corpora for conversational phrases
%A Székely, Éva
%A Cabral, Joao Paulo
%A Abou-Zleikha, Mohamed
%A Cahill, Peter
%A Carson-Berndsen, Julie
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Doğan, Mehmet Uğur
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 May
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F szekely-etal-2012-evaluating
%X Audiobooks are a rich resource of large quantities of natural sounding, highly expressive speech. In our previous research we have shown that it is possible to detect different expressive voice styles represented in a particular audiobook, using unsupervised clustering to group the speech corpus of the audiobook into smaller subsets representing the detected voice styles. These subsets of corpora of different voice styles reflect the various ways a speaker uses their voice to express involvement and affect, or imitate characters. This study is an evaluation of the detection of voice styles in an audiobook in the application of expressive speech synthesis. A further aim of this study is to investigate the usability of audiobooks as a language resource for expressive speech synthesis of utterances of conversational speech. Two evaluations have been carried out to assess the effect of the genre transfer: transmitting expressive speech from read aloud literature to conversational phrases with the application of speech synthesis. The first evaluation revealed that listeners have different voice style preferences for a particular conversational phrase. The second evaluation showed that it is possible for users of speech synthesis systems to learn the characteristics of a voice style well enough to make reliable predictions about what a certain utterance will sound like when synthesised using that voice style.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/864_Paper.pdf
%P 3335-3339
Markdown (Informal)
[Evaluating expressive speech synthesis from audiobook corpora for conversational phrases](http://www.lrec-conf.org/proceedings/lrec2012/pdf/864_Paper.pdf) (Székely et al., LREC 2012)
ACL