% Conference paper in the AMTA 2014 proceedings (MT Researchers Track).
% The month field carries the month macro plus the day range 22--26.
@inproceedings{wuebker-etal-2014-comparison,
  title     = {Comparison of data selection techniques for the translation of video lectures},
  author    = {Wuebker, Joern and
               Ney, Hermann and
               Mart{\'\i}nez-Villaronga, Adri{\`a} and
               Gim{\'e}nez, Adri{\`a} and
               Juan, Alfons and
               Servan, Christophe and
               Dymetman, Marc and
               Mirkin, Shachar},
  editor    = {Al-Onaizan, Yaser and
               Simard, Michel},
  booktitle = {Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: {MT} Researchers Track},
  month     = oct # " 22--26",
  year      = {2014},
  address   = {Vancouver, Canada},
  publisher = {Association for Machine Translation in the Americas},
  url       = {https://aclanthology.org/2014.amta-researchers.15},
  pages     = {193--207},
  abstract  = {For the task of online translation of scientific video lectures, using huge models is not possible. In order to get smaller and efficient models, we perform data selection. In this paper, we perform a qualitative and quantitative comparison of several data selection techniques, based on cross-entropy and infrequent n-gram criteria. In terms of BLEU, a combination of translation and language model cross-entropy achieves the most stable results. As another important criterion for measuring translation quality in our application, we identify the number of out-of-vocabulary words. Here, infrequent n-gram recovery shows superior performance. Finally, we combine the two selection techniques in order to benefit from both their strengths.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for the conference paper with citekey wuebker-etal-2014-comparison. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wuebker-etal-2014-comparison">
    <titleInfo>
      <title>Comparison of data selection techniques for the translation of video lectures</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Joern</namePart>
      <namePart type="family">Wuebker</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hermann</namePart>
      <namePart type="family">Ney</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adrià</namePart>
      <namePart type="family">Martínez-Villaronga</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adrià</namePart>
      <namePart type="family">Giménez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alfons</namePart>
      <namePart type="family">Juan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christophe</namePart>
      <namePart type="family">Servan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marc</namePart>
      <namePart type="family">Dymetman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shachar</namePart>
      <namePart type="family">Mirkin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <!-- Date range 22 to 26 October 2014, expressed as machine-readable start/end points. -->
      <dateIssued encoding="w3cdtf" point="start">2014-10-22</dateIssued>
      <dateIssued encoding="w3cdtf" point="end">2014-10-26</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michel</namePart>
        <namePart type="family">Simard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Machine Translation in the Americas</publisher>
        <place>
          <placeTerm type="text">Vancouver, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>For the task of online translation of scientific video lectures, using huge models is not possible. In order to get smaller and efficient models, we perform data selection. In this paper, we perform a qualitative and quantitative comparison of several data selection techniques, based on cross-entropy and infrequent n-gram criteria. In terms of BLEU, a combination of translation and language model cross-entropy achieves the most stable results. As another important criterion for measuring translation quality in our application, we identify the number of out-of-vocabulary words. Here, infrequent n-gram recovery shows superior performance. Finally, we combine the two selection techniques in order to benefit from both their strengths.</abstract>
    <identifier type="citekey">wuebker-etal-2014-comparison</identifier>
    <location>
      <url>https://aclanthology.org/2014.amta-researchers.15</url>
    </location>
    <part>
      <!-- Same date range as originInfo/dateIssued above. -->
      <date encoding="w3cdtf" point="start">2014-10-22</date>
      <date encoding="w3cdtf" point="end">2014-10-26</date>
      <extent unit="page">
        <start>193</start>
        <end>207</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Comparison of data selection techniques for the translation of video lectures
%A Wuebker, Joern
%A Ney, Hermann
%A Martínez-Villaronga, Adrià
%A Giménez, Adrià
%A Juan, Alfons
%A Servan, Christophe
%A Dymetman, Marc
%A Mirkin, Shachar
%Y Al-Onaizan, Yaser
%Y Simard, Michel
%S Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track
%D 2014
%8 oct 22-26
%I Association for Machine Translation in the Americas
%C Vancouver, Canada
%F wuebker-etal-2014-comparison
%X For the task of online translation of scientific video lectures, using huge models is not possible. In order to get smaller and efficient models, we perform data selection. In this paper, we perform a qualitative and quantitative comparison of several data selection techniques, based on cross-entropy and infrequent n-gram criteria. In terms of BLEU, a combination of translation and language model cross-entropy achieves the most stable results. As another important criterion for measuring translation quality in our application, we identify the number of out-of-vocabulary words. Here, infrequent n-gram recovery shows superior performance. Finally, we combine the two selection techniques in order to benefit from both their strengths.
%U https://aclanthology.org/2014.amta-researchers.15
%P 193-207
Markdown (Informal)
[Comparison of data selection techniques for the translation of video lectures](https://aclanthology.org/2014.amta-researchers.15) (Wuebker et al., AMTA 2014)
ACL
- Joern Wuebker, Hermann Ney, Adrià Martínez-Villaronga, Adrià Giménez, Alfons Juan, Christophe Servan, Marc Dymetman, and Shachar Mirkin. 2014. Comparison of data selection techniques for the translation of video lectures. In Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track, pages 193–207, Vancouver, Canada. Association for Machine Translation in the Americas.