@inproceedings{szekely-etal-2020-augmented,
title = "Augmented Prompt Selection for Evaluation of Spontaneous Speech Synthesis",
author = "Szekely, Eva and
Edlund, Jens and
Gustafson, Joakim",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.782/",
pages = "6368--6374",
language = "eng",
ISBN = "979-10-95546-34-4",
abstract = "By definition, spontaneous speech is unscripted and created on the fly by the speaker. It is dramatically different from read speech, where the words are authored as text before they are spoken. Spontaneous speech is emergent and transient, whereas text read out loud is pre-planned. For this reason, it is unsuitable to evaluate the usability and appropriateness of spontaneous speech synthesis by having it read out written texts sampled from for example newspapers or books. Instead, we need to use transcriptions of speech as the target - something that is much less readily available. In this paper, we introduce Starmap, a tool allowing developers to select a varied, representative set of utterances from a spoken genre, to be used for evaluation of TTS for a given domain. The selection can be done from any speech recording, without the need for transcription. The tool uses interactive visualisation of prosodic features with t-SNE, along with a tree-based algorithm to guide the user through thousands of utterances and ensure coverage of a variety of prompts. A listening test has shown that with a selection of genre-specific utterances, it is possible to show significant differences across genres between two synthetic voices built from spontaneous speech."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="szekely-etal-2020-augmented">
<titleInfo>
<title>Augmented Prompt Selection for Evaluation of Spontaneous Speech Synthesis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eva</namePart>
<namePart type="family">Szekely</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jens</namePart>
<namePart type="family">Edlund</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Gustafson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>By definition, spontaneous speech is unscripted and created on the fly by the speaker. It is dramatically different from read speech, where the words are authored as text before they are spoken. Spontaneous speech is emergent and transient, whereas text read out loud is pre-planned. For this reason, it is unsuitable to evaluate the usability and appropriateness of spontaneous speech synthesis by having it read out written texts sampled from for example newspapers or books. Instead, we need to use transcriptions of speech as the target - something that is much less readily available. In this paper, we introduce Starmap, a tool allowing developers to select a varied, representative set of utterances from a spoken genre, to be used for evaluation of TTS for a given domain. The selection can be done from any speech recording, without the need for transcription. The tool uses interactive visualisation of prosodic features with t-SNE, along with a tree-based algorithm to guide the user through thousands of utterances and ensure coverage of a variety of prompts. A listening test has shown that with a selection of genre-specific utterances, it is possible to show significant differences across genres between two synthetic voices built from spontaneous speech.</abstract>
<identifier type="citekey">szekely-etal-2020-augmented</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.782/</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>6368</start>
<end>6374</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Augmented Prompt Selection for Evaluation of Spontaneous Speech Synthesis
%A Szekely, Eva
%A Edlund, Jens
%A Gustafson, Joakim
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G eng
%F szekely-etal-2020-augmented
%X By definition, spontaneous speech is unscripted and created on the fly by the speaker. It is dramatically different from read speech, where the words are authored as text before they are spoken. Spontaneous speech is emergent and transient, whereas text read out loud is pre-planned. For this reason, it is unsuitable to evaluate the usability and appropriateness of spontaneous speech synthesis by having it read out written texts sampled from for example newspapers or books. Instead, we need to use transcriptions of speech as the target - something that is much less readily available. In this paper, we introduce Starmap, a tool allowing developers to select a varied, representative set of utterances from a spoken genre, to be used for evaluation of TTS for a given domain. The selection can be done from any speech recording, without the need for transcription. The tool uses interactive visualisation of prosodic features with t-SNE, along with a tree-based algorithm to guide the user through thousands of utterances and ensure coverage of a variety of prompts. A listening test has shown that with a selection of genre-specific utterances, it is possible to show significant differences across genres between two synthetic voices built from spontaneous speech.
%U https://aclanthology.org/2020.lrec-1.782/
%P 6368-6374
Markdown (Informal)
[Augmented Prompt Selection for Evaluation of Spontaneous Speech Synthesis](https://aclanthology.org/2020.lrec-1.782/) (Szekely et al., LREC 2020)
ACL