@inproceedings{lin-etal-2023-train,
title = "How to Train Your Dragon: Diverse Augmentation Towards Generalizable Dense Retrieval",
author = "Lin, Sheng-Chieh and
Asai, Akari and
Li, Minghan and
Oguz, Barlas and
Lin, Jimmy and
Mehdad, Yashar and
Yih, Wen-tau and
Chen, Xilun",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.423",
doi = "10.18653/v1/2023.findings-emnlp.423",
pages = "6385--6400",
abstract = "Various techniques have been developed in recent years to improve dense retrieval (DR), such as unsupervised contrastive learning and pseudo-query generation. Existing DRs, however, often suffer from effectiveness tradeoffs between supervised and zero-shot retrieval, which some argue was due to the limited model capacity. We contradict this hypothesis and show that a generalizable DR can be trained to achieve high accuracy in both supervised and zero-shot retrieval without increasing model size. In particular, we systematically examine the contrastive learning of DRs, under the framework of Data Augmentation (DA). Our study shows that common DA practices such as query augmentation with generative models and pseudo-relevance label creation using a cross-encoder, are often inefficient and sub-optimal. We hence propose a new DA approach with diverse queries and sources of supervision to progressively train a generalizable DR. As a result, DRAGON, our Dense Retriever trained with diverse AuGmentatiON, is the first BERT-base-sized DR to achieve state-of-the-art effectiveness in both supervised and zero-shot evaluations and even competes with models using more complex late interaction.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lin-etal-2023-train">
<titleInfo>
<title>How to Train Your Dragon: Diverse Augmentation Towards Generalizable Dense Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sheng-Chieh</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akari</namePart>
<namePart type="family">Asai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minghan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barlas</namePart>
<namePart type="family">Oguz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jimmy</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yashar</namePart>
<namePart type="family">Mehdad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wen-tau</namePart>
<namePart type="family">Yih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xilun</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Various techniques have been developed in recent years to improve dense retrieval (DR), such as unsupervised contrastive learning and pseudo-query generation. Existing DRs, however, often suffer from effectiveness tradeoffs between supervised and zero-shot retrieval, which some argue was due to the limited model capacity. We contradict this hypothesis and show that a generalizable DR can be trained to achieve high accuracy in both supervised and zero-shot retrieval without increasing model size. In particular, we systematically examine the contrastive learning of DRs, under the framework of Data Augmentation (DA). Our study shows that common DA practices such as query augmentation with generative models and pseudo-relevance label creation using a cross-encoder, are often inefficient and sub-optimal. We hence propose a new DA approach with diverse queries and sources of supervision to progressively train a generalizable DR. As a result, DRAGON, our Dense Retriever trained with diverse AuGmentatiON, is the first BERT-base-sized DR to achieve state-of-the-art effectiveness in both supervised and zero-shot evaluations and even competes with models using more complex late interaction.</abstract>
<identifier type="citekey">lin-etal-2023-train</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.423</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.423</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>6385</start>
<end>6400</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How to Train Your Dragon: Diverse Augmentation Towards Generalizable Dense Retrieval
%A Lin, Sheng-Chieh
%A Asai, Akari
%A Li, Minghan
%A Oguz, Barlas
%A Lin, Jimmy
%A Mehdad, Yashar
%A Yih, Wen-tau
%A Chen, Xilun
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F lin-etal-2023-train
%X Various techniques have been developed in recent years to improve dense retrieval (DR), such as unsupervised contrastive learning and pseudo-query generation. Existing DRs, however, often suffer from effectiveness tradeoffs between supervised and zero-shot retrieval, which some argue was due to the limited model capacity. We contradict this hypothesis and show that a generalizable DR can be trained to achieve high accuracy in both supervised and zero-shot retrieval without increasing model size. In particular, we systematically examine the contrastive learning of DRs, under the framework of Data Augmentation (DA). Our study shows that common DA practices such as query augmentation with generative models and pseudo-relevance label creation using a cross-encoder, are often inefficient and sub-optimal. We hence propose a new DA approach with diverse queries and sources of supervision to progressively train a generalizable DR. As a result, DRAGON, our Dense Retriever trained with diverse AuGmentatiON, is the first BERT-base-sized DR to achieve state-of-the-art effectiveness in both supervised and zero-shot evaluations and even competes with models using more complex late interaction.
%R 10.18653/v1/2023.findings-emnlp.423
%U https://aclanthology.org/2023.findings-emnlp.423
%U https://doi.org/10.18653/v1/2023.findings-emnlp.423
%P 6385-6400
Markdown (Informal)
[How to Train Your Dragon: Diverse Augmentation Towards Generalizable Dense Retrieval](https://aclanthology.org/2023.findings-emnlp.423) (Lin et al., Findings 2023)
ACL