@inproceedings{tayir-etal-2024-visual,
title = "Visual Pivoting Unsupervised Multimodal Machine Translation in Low-Resource Distant Language Pairs",
author = "Tayir, Turghun and
Li, Lin and
Tao, Xiaohui and
Maimaiti, Mieradilijiang and
Li, Ming and
Liu, Jianquan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.320/",
doi = "10.18653/v1/2024.findings-emnlp.320",
pages = "5596--5607",
abstract = "Unsupervised multimodal machine translation (UMMT) aims to leverage vision information as a pivot between two languages to achieve better performance on low-resource language pairs. However, there is presently a challenge: how to handle alignment between distant language pairs (DLPs) in UMMT. To this end, this paper proposes a visual pivoting UMMT method for DLPs. Specifically, we first construct a dataset containing two DLPs, including English-Uyghur and Chinese-Uyghur. We then apply the visual pivoting method for both to pre-training and fine-tuning, and we observe that the images on the encoder and decoder of UMMT have noticeable effects on DLPs. Finally, we introduce informative multi-granularity image features to facilitate further alignment of the latent space between the two languages. Experimental results show that the proposed method significantly outperforms several baselines on DLPs and close language pairs (CLPs)."
}
[Visual Pivoting Unsupervised Multimodal Machine Translation in Low-Resource Distant Language Pairs](https://aclanthology.org/2024.findings-emnlp.320/) (Tayir et al., Findings 2024)