% Conference paper (10th IWSLT workshop, 2013); the url field points at the ACL Anthology record.
% Whole-word brace protection is used in the title so styles that downcase titles keep "Finnish" capitalised.
@inproceedings{enarvi-kurimo-2013-studies,
  title     = {Studies on training text selection for conversational {Finnish} language modeling},
  author    = {Enarvi, Seppo and
               Kurimo, Mikko},
  editor    = {Zhang, Joy Ying},
  booktitle = {Proceedings of the 10th International Workshop on Spoken Language Translation: Papers},
  month     = dec # " 5--6",
  year      = {2013},
  address   = {Heidelberg, Germany},
  url       = {https://aclanthology.org/2013.iwslt-papers.9},
  abstract  = {Current ASR and MT systems do not operate on conversational Finnish, because training data for colloquial Finnish has not been available. Although speech recognition performance on literary Finnish is already quite good, those systems have very poor baseline performance in conversational speech. Text data for relevant vocabulary and language models can be collected from the Internet, but web data is very noisy and most of it is not helpful for learning good models. Finnish language is highly agglutinative, and written phonetically. Even phonetic reductions and sandhi are often written down in informal discussions. This increases vocabulary size dramatically and causes word-based selection methods to fail. Our selection method explicitly optimizes the perplexity of a subword language model on the development data, and requires only very limited amount of speech transcripts as development data. The language models have been evaluated for speech recognition using a new data set consisting of generic colloquial Finnish.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for a single conference paper. The nested relatedItem type="host"
     describes the proceedings volume (title, editor, place) the paper appears in. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="enarvi-kurimo-2013-studies">
    <titleInfo>
      <title>Studies on training text selection for conversational Finnish language modeling</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Seppo</namePart>
      <namePart type="family">Enarvi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mikko</namePart>
      <namePart type="family">Kurimo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <!-- Date range 5-6 December 2013, expressed as machine-readable start/end points. -->
      <dateIssued encoding="w3cdtf" point="start">2013-12-05</dateIssued>
      <dateIssued encoding="w3cdtf" point="end">2013-12-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 10th International Workshop on Spoken Language Translation: Papers</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Joy</namePart>
        <namePart type="given">Ying</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <place>
          <placeTerm type="text">Heidelberg, Germany</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Current ASR and MT systems do not operate on conversational Finnish, because training data for colloquial Finnish has not been available. Although speech recognition performance on literary Finnish is already quite good, those systems have very poor baseline performance in conversational speech. Text data for relevant vocabulary and language models can be collected from the Internet, but web data is very noisy and most of it is not helpful for learning good models. Finnish language is highly agglutinative, and written phonetically. Even phonetic reductions and sandhi are often written down in informal discussions. This increases vocabulary size dramatically and causes word-based selection methods to fail. Our selection method explicitly optimizes the perplexity of a subword language model on the development data, and requires only very limited amount of speech transcripts as development data. The language models have been evaluated for speech recognition using a new data set consisting of generic colloquial Finnish.</abstract>
    <identifier type="citekey">enarvi-kurimo-2013-studies</identifier>
    <location>
      <url>https://aclanthology.org/2013.iwslt-papers.9</url>
    </location>
    <part>
      <!-- Same 5-6 December 2013 range as originInfo/dateIssued. -->
      <date encoding="w3cdtf" point="start">2013-12-05</date>
      <date encoding="w3cdtf" point="end">2013-12-06</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Studies on training text selection for conversational Finnish language modeling
%A Enarvi, Seppo
%A Kurimo, Mikko
%Y Zhang, Joy Ying
%S Proceedings of the 10th International Workshop on Spoken Language Translation: Papers
%D 2013
%8 December 5-6
%C Heidelberg, Germany
%F enarvi-kurimo-2013-studies
%X Current ASR and MT systems do not operate on conversational Finnish, because training data for colloquial Finnish has not been available. Although speech recognition performance on literary Finnish is already quite good, those systems have very poor baseline performance in conversational speech. Text data for relevant vocabulary and language models can be collected from the Internet, but web data is very noisy and most of it is not helpful for learning good models. Finnish language is highly agglutinative, and written phonetically. Even phonetic reductions and sandhi are often written down in informal discussions. This increases vocabulary size dramatically and causes word-based selection methods to fail. Our selection method explicitly optimizes the perplexity of a subword language model on the development data, and requires only very limited amount of speech transcripts as development data. The language models have been evaluated for speech recognition using a new data set consisting of generic colloquial Finnish.
%U https://aclanthology.org/2013.iwslt-papers.9
Markdown (Informal)
[Studies on training text selection for conversational Finnish language modeling](https://aclanthology.org/2013.iwslt-papers.9) (Enarvi & Kurimo, IWSLT 2013)
ACL