@inproceedings{nguyen-sakti-2024-multilingual,
    title = "Multilingual Self-supervised Visually Grounded Speech Models",
    author = "Nguyen, Huynh Phuong Thanh and
      Sakti, Sakriani",
    editor = "Melero, Maite and
      Sakti, Sakriani and
      Soria, Claudia",
    booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
    month = may,
    year = "2024",
    address = "Torino, Italia",
    publisher = "ELRA and ICCL",
    url = "https://aclanthology.org/2024.sigul-1.28",
    pages = "237--243",
abstract = "Developing a multilingual speech-to-speech translation system poses challenges due to the scarcity of paired speech data in various languages, particularly when dealing with unknown and untranscribed languages. However, the shared semantic representation across multiple languages presents an opportunity to build a translation system based on images. Recently, researchers have explored methods for aligning bilingual speech as a novel approach to discovering speech pairs using semantic images from unknown and untranscribed speech. These aligned speech pairs can then be utilized to train speech-to-speech translation systems. Our research builds upon these approaches by expanding into multiple languages and focusing on achieving multimodal multilingual pairs alignment, with a key component being multilingual visually grounded speech models. The objectives of our research are twofold: (1) to create visually grounded speech datasets for English, Japanese, Indonesian, and Vietnamese, and (2) to develop self-supervised visually grounded speech models for these languages. Our experiments have demonstrated the feasibility of this approach, showcasing the ability to retrieve associations between speeches and images. The results indicate that our multilingual visually grounded speech models yield promising outcomes in representing speeches using semantic images across multiple languages.",
}
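The abstract above describes visually grounded speech models that retrieve associations between speech and images. As a rough illustration of how such grounding is commonly trained, the sketch below projects pooled speech and image features into a shared space under a contrastive (InfoNCE-style) objective. All module names, feature dimensions, and the loss choice are illustrative assumptions, not the authors' implementation.

# Hypothetical sketch of contrastive speech-image grounding; the
# encoders, dimensions, and loss are assumptions, not the paper's setup.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SpeechImageAligner(nn.Module):
    def __init__(self, speech_dim=768, image_dim=2048, embed_dim=512):
        super().__init__()
        # Project pooled speech and image features into one shared space.
        self.speech_proj = nn.Linear(speech_dim, embed_dim)
        self.image_proj = nn.Linear(image_dim, embed_dim)

    def forward(self, speech_feats, image_feats):
        # L2-normalise so dot products are cosine similarities.
        s = F.normalize(self.speech_proj(speech_feats), dim=-1)
        v = F.normalize(self.image_proj(image_feats), dim=-1)
        return s @ v.t()  # (batch, batch) speech-to-image similarities

def contrastive_loss(sim, temperature=0.07):
    # Matched speech/image pairs lie on the diagonal; train both
    # retrieval directions (speech-to-image and image-to-speech).
    targets = torch.arange(sim.size(0), device=sim.device)
    logits = sim / temperature
    return (F.cross_entropy(logits, targets)
            + F.cross_entropy(logits.t(), targets)) / 2

# Usage with placeholder features standing in for the outputs of a
# self-supervised speech encoder and an image encoder:
model = SpeechImageAligner()
speech = torch.randn(8, 768)   # pooled speech features, one per utterance
images = torch.randn(8, 2048)  # pooled image features, matched by index
loss = contrastive_loss(model(speech, images))

An objective of this kind pulls matched speech and image embeddings together, which is what enables the cross-modal retrieval and, across languages, the image-mediated pairing of utterances that the abstract describes.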
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="nguyen-sakti-2024-multilingual">
    <titleInfo>
      <title>Multilingual Self-supervised Visually Grounded Speech Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Huynh</namePart>
      <namePart type="given">Phuong</namePart>
      <namePart type="given">Thanh</namePart>
      <namePart type="family">Nguyen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sakriani</namePart>
      <namePart type="family">Sakti</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maite</namePart>
        <namePart type="family">Melero</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Claudia</namePart>
        <namePart type="family">Soria</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Developing a multilingual speech-to-speech translation system poses challenges due to the scarcity of paired speech data in various languages, particularly when dealing with unknown and untranscribed languages. However, the shared semantic representation across multiple languages presents an opportunity to build a translation system based on images. Recently, researchers have explored methods for aligning bilingual speech as a novel approach to discovering speech pairs from unknown and untranscribed speech using semantic images. These aligned speech pairs can then be used to train speech-to-speech translation systems. Our research builds on these approaches by expanding to multiple languages and focusing on multimodal multilingual pair alignment, with multilingual visually grounded speech models as a key component. The objectives of our research are twofold: (1) to create visually grounded speech datasets for English, Japanese, Indonesian, and Vietnamese, and (2) to develop self-supervised visually grounded speech models for these languages. Our experiments demonstrate the feasibility of this approach, showcasing the ability to retrieve associations between speech and images. The results indicate that our multilingual visually grounded speech models yield promising results in representing speech with semantic images across multiple languages.</abstract>
    <identifier type="citekey">nguyen-sakti-2024-multilingual</identifier>
    <location>
      <url>https://aclanthology.org/2024.sigul-1.28</url>
    </location>
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>237</start>
        <end>243</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Multilingual Self-supervised Visually Grounded Speech Models
%A Nguyen, Huynh Phuong Thanh
%A Sakti, Sakriani
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F nguyen-sakti-2024-multilingual
%X Developing a multilingual speech-to-speech translation system poses challenges due to the scarcity of paired speech data in various languages, particularly when dealing with unknown and untranscribed languages. However, the shared semantic representation across multiple languages presents an opportunity to build a translation system based on images. Recently, researchers have explored methods for aligning bilingual speech as a novel approach to discovering speech pairs from unknown and untranscribed speech using semantic images. These aligned speech pairs can then be used to train speech-to-speech translation systems. Our research builds on these approaches by expanding to multiple languages and focusing on multimodal multilingual pair alignment, with multilingual visually grounded speech models as a key component. The objectives of our research are twofold: (1) to create visually grounded speech datasets for English, Japanese, Indonesian, and Vietnamese, and (2) to develop self-supervised visually grounded speech models for these languages. Our experiments demonstrate the feasibility of this approach, showcasing the ability to retrieve associations between speech and images. The results indicate that our multilingual visually grounded speech models yield promising results in representing speech with semantic images across multiple languages.
%U https://aclanthology.org/2024.sigul-1.28
%P 237-243
Markdown (Informal)
[Multilingual Self-supervised Visually Grounded Speech Models](https://aclanthology.org/2024.sigul-1.28) (Nguyen & Sakti, SIGUL-WS 2024)
ACL
Huynh Phuong Thanh Nguyen and Sakriani Sakti. 2024. Multilingual Self-supervised Visually Grounded Speech Models. In Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024, pages 237–243, Torino, Italia. ELRA and ICCL.