@inproceedings{sarioglu-kayi-etal-2020-multiseg,
title = "{M}ulti{S}eg: Parallel Data and Subword Information for Learning Bilingual Embeddings in Low Resource Scenarios",
author = "Sarioglu Kayi, Efsun and
Anand, Vishal and
Muresan, Smaranda",
editor = "Beermann, Dorothee and
Besacier, Laurent and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources association",
url = "https://aclanthology.org/2020.sltu-1.13/",
pages = "97--105",
language = "eng",
ISBN = "979-10-95546-35-1",
abstract = "Distributed word embeddings have become ubiquitous in natural language processing as they have been shown to improve performance in many semantic and syntactic tasks. Popular models for learning cross-lingual word embeddings do not consider the morphology of words. We propose an approach to learn bilingual embeddings using parallel data and subword information that is expressed in various forms, i.e. character n-grams, morphemes obtained by unsupervised morphological segmentation and byte pair encoding. We report results for three low resource morphologically rich languages (Swahili, Tagalog, and Somali) and a high resource language (German) in a simulated a low-resource scenario. Our results show that our method that leverages subword information outperforms the model without subword information, both in intrinsic and extrinsic evaluations of the learned embeddings. Specifically, analogy reasoning results show that using subwords helps capture syntactic characteristics. Semantically, word similarity results and intrinsically, word translation scores demonstrate superior performance over existing methods. Finally, qualitative analysis also shows better-quality cross-lingual embeddings particularly for morphological variants in both languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sarioglu-kayi-etal-2020-multiseg">
<titleInfo>
<title>MultiSeg: Parallel Data and Subword Information for Learning Bilingual Embeddings in Low Resource Scenarios</title>
</titleInfo>
<name type="personal">
<namePart type="given">Efsun</namePart>
<namePart type="family">Sarioglu Kayi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vishal</namePart>
<namePart type="family">Anand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dorothee</namePart>
<namePart type="family">Beermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-35-1</identifier>
</relatedItem>
<abstract>Distributed word embeddings have become ubiquitous in natural language processing, as they have been shown to improve performance in many semantic and syntactic tasks. Popular models for learning cross-lingual word embeddings do not consider the morphology of words. We propose an approach to learn bilingual embeddings using parallel data and subword information expressed in various forms, i.e., character n-grams, morphemes obtained by unsupervised morphological segmentation, and byte pair encoding. We report results for three low-resource, morphologically rich languages (Swahili, Tagalog, and Somali) and a high-resource language (German) in a simulated low-resource scenario. Our results show that our method, which leverages subword information, outperforms the model without subword information in both intrinsic and extrinsic evaluations of the learned embeddings. Specifically, analogy reasoning results show that using subwords helps capture syntactic characteristics. Semantically, word similarity results and, intrinsically, word translation scores demonstrate superior performance over existing methods. Finally, qualitative analysis also shows better-quality cross-lingual embeddings, particularly for morphological variants in both languages.</abstract>
<identifier type="citekey">sarioglu-kayi-etal-2020-multiseg</identifier>
<location>
<url>https://aclanthology.org/2020.sltu-1.13/</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>97</start>
<end>105</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MultiSeg: Parallel Data and Subword Information for Learning Bilingual Embeddings in Low Resource Scenarios
%A Sarioglu Kayi, Efsun
%A Anand, Vishal
%A Muresan, Smaranda
%Y Beermann, Dorothee
%Y Besacier, Laurent
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-35-1
%G eng
%F sarioglu-kayi-etal-2020-multiseg
%X Distributed word embeddings have become ubiquitous in natural language processing, as they have been shown to improve performance in many semantic and syntactic tasks. Popular models for learning cross-lingual word embeddings do not consider the morphology of words. We propose an approach to learn bilingual embeddings using parallel data and subword information expressed in various forms, i.e., character n-grams, morphemes obtained by unsupervised morphological segmentation, and byte pair encoding. We report results for three low-resource, morphologically rich languages (Swahili, Tagalog, and Somali) and a high-resource language (German) in a simulated low-resource scenario. Our results show that our method, which leverages subword information, outperforms the model without subword information in both intrinsic and extrinsic evaluations of the learned embeddings. Specifically, analogy reasoning results show that using subwords helps capture syntactic characteristics. Semantically, word similarity results and, intrinsically, word translation scores demonstrate superior performance over existing methods. Finally, qualitative analysis also shows better-quality cross-lingual embeddings, particularly for morphological variants in both languages.
%U https://aclanthology.org/2020.sltu-1.13/
%P 97-105
Markdown (Informal)
[MultiSeg: Parallel Data and Subword Information for Learning Bilingual Embeddings in Low Resource Scenarios](https://aclanthology.org/2020.sltu-1.13/) (Sarioglu Kayi et al., SLTU 2020)
ACL
Efsun Sarioglu Kayi, Vishal Anand, and Smaranda Muresan. 2020. [MultiSeg: Parallel Data and Subword Information for Learning Bilingual Embeddings in Low Resource Scenarios](https://aclanthology.org/2020.sltu-1.13/). In Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL), pages 97–105, Marseille, France. European Language Resources Association.