@inproceedings{ji-etal-2024-robust,
title = "Robust Guidance for Unsupervised Data Selection: Capturing Perplexing Named Entities for Domain-Specific Machine Translation",
author = "Ji, Seunghyun and
Sinulingga, Hagai Raja and
Kwon, Darongsae",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.sigul-1.37",
pages = "307--317",
abstract = "Low-resourced data presents a significant challenge for neural machine translation. In most cases, the low-resourced environment is caused by high costs due to the need for domain experts or the lack of language experts. Therefore, identifying the most training-efficient data within an unsupervised setting emerges as a practical strategy. Recent research suggests that such effective data can be identified by selecting {`}appropriately complex data{'} based on its volume, providing strong intuition for unsupervised data selection. However, we have discovered that establishing criteria for unsupervised data selection remains a challenge, as the {`}appropriate level of difficulty{'} may vary depending on the data domain. We introduce a novel unsupervised data selection method named {`}Capturing Perplexing Named Entities,{'} which leverages the maximum inference entropy in translated named entities as a metric for selection. When tested with the {`}Korean-English Parallel Corpus of Specialized Domains,{'} our method served as robust guidance for identifying training-efficient data across different domains, in contrast to existing methods.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ji-etal-2024-robust">
    <titleInfo>
      <title>Robust Guidance for Unsupervised Data Selection: Capturing Perplexing Named Entities for Domain-Specific Machine Translation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Seunghyun</namePart>
      <namePart type="family">Ji</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hagai</namePart>
      <namePart type="given">Raja</namePart>
      <namePart type="family">Sinulingga</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Darongsae</namePart>
      <namePart type="family">Kwon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maite</namePart>
        <namePart type="family">Melero</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Claudia</namePart>
        <namePart type="family">Soria</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Low-resourced data presents a significant challenge for neural machine translation. In most cases, the low-resourced environment is caused by high costs due to the need for domain experts or the lack of language experts. Therefore, identifying the most training-efficient data within an unsupervised setting emerges as a practical strategy. Recent research suggests that such effective data can be identified by selecting ‘appropriately complex data’ based on its volume, providing strong intuition for unsupervised data selection. However, we have discovered that establishing criteria for unsupervised data selection remains a challenge, as the ‘appropriate level of difficulty’ may vary depending on the data domain. We introduce a novel unsupervised data selection method named ‘Capturing Perplexing Named Entities,’ which leverages the maximum inference entropy in translated named entities as a metric for selection. When tested with the ‘Korean-English Parallel Corpus of Specialized Domains,’ our method served as robust guidance for identifying training-efficient data across different domains, in contrast to existing methods.</abstract>
    <identifier type="citekey">ji-etal-2024-robust</identifier>
    <location>
      <url>https://aclanthology.org/2024.sigul-1.37</url>
    </location>
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>307</start>
        <end>317</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Robust Guidance for Unsupervised Data Selection: Capturing Perplexing Named Entities for Domain-Specific Machine Translation
%A Ji, Seunghyun
%A Sinulingga, Hagai Raja
%A Kwon, Darongsae
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F ji-etal-2024-robust
%X Low-resourced data presents a significant challenge for neural machine translation. In most cases, the low-resourced environment is caused by high costs due to the need for domain experts or the lack of language experts. Therefore, identifying the most training-efficient data within an unsupervised setting emerges as a practical strategy. Recent research suggests that such effective data can be identified by selecting ‘appropriately complex data’ based on its volume, providing strong intuition for unsupervised data selection. However, we have discovered that establishing criteria for unsupervised data selection remains a challenge, as the ‘appropriate level of difficulty’ may vary depending on the data domain. We introduce a novel unsupervised data selection method named ‘Capturing Perplexing Named Entities,’ which leverages the maximum inference entropy in translated named entities as a metric for selection. When tested with the ‘Korean-English Parallel Corpus of Specialized Domains,’ our method served as robust guidance for identifying training-efficient data across different domains, in contrast to existing methods.
%U https://aclanthology.org/2024.sigul-1.37
%P 307-317
Markdown (Informal)
[Robust Guidance for Unsupervised Data Selection: Capturing Perplexing Named Entities for Domain-Specific Machine Translation](https://aclanthology.org/2024.sigul-1.37) (Ji et al., SIGUL-WS 2024)
ACL
Seunghyun Ji, Hagai Raja Sinulingga, and Darongsae Kwon. 2024. [Robust Guidance for Unsupervised Data Selection: Capturing Perplexing Named Entities for Domain-Specific Machine Translation](https://aclanthology.org/2024.sigul-1.37). In *Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024*, pages 307–317, Torino, Italia. ELRA and ICCL.
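
The abstract above describes scoring candidate sentence pairs by the maximum inference entropy over translated named-entity tokens. Purely as an illustration of that idea, here is a minimal numpy sketch of entropy-based selection; the function names, the assumption that per-token output distributions for named-entity spans have already been extracted (e.g. from an NMT model plus an NER tagger on the translations), and the toy data are hypothetical and not the authors' released implementation.

```python
# Hypothetical sketch: select sentences whose named-entity tokens the
# translation model is most uncertain about (maximum token-level entropy).
import numpy as np


def token_entropy(probs: np.ndarray) -> np.ndarray:
    """Shannon entropy (in nats) of each row of a (num_tokens, vocab) matrix."""
    p = np.clip(probs, 1e-12, 1.0)
    return -(p * np.log(p)).sum(axis=-1)


def sentence_score(ne_token_probs: np.ndarray) -> float:
    """Score a sentence by the maximum entropy over its named-entity tokens."""
    if ne_token_probs.size == 0:
        return 0.0  # no named entities -> least "perplexing" under this proxy
    return float(token_entropy(ne_token_probs).max())


def select_top_k(candidates, k):
    """candidates: list of (sentence_id, ne_token_probs); return k highest-scoring ids."""
    ranked = sorted(candidates, key=lambda c: sentence_score(c[1]), reverse=True)
    return [sid for sid, _ in ranked[:k]]


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    vocab = 8
    # Toy data: three sentences with 2, 1, and 0 named-entity tokens respectively.
    toy = [
        ("sent-1", rng.dirichlet(np.ones(vocab), size=2)),          # flatter -> higher entropy
        ("sent-2", rng.dirichlet(np.full(vocab, 0.1), size=1)),     # peaky -> typically lower entropy
        ("sent-3", np.empty((0, vocab))),                           # no named entities
    ]
    print(select_top_k(toy, k=2))
```

The toy run prefers the sentences whose named-entity tokens carry the highest predictive uncertainty, which is the selection signal the abstract describes; how entropies are obtained from the underlying NMT model is left open here.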