% Conference paper record (ACL Anthology ID 2022.aacl-main.1).
% Values are brace-delimited; month uses the predefined three-letter macro.
% NOTE(review): the last editor's given name is recorded as "Chua-Hui"; this may be
% a misspelling of "Chia-Hui". Confirm against the proceedings front matter before
% changing it, and keep the MODS/EndNote exports of this record in sync.
@inproceedings{kumar-etal-2022-chasing,
    title     = {Chasing the Tail with Domain Generalization: A Case Study on Frequency-Enriched Datasets},
    author    = {Kumar, Manoj and
                 Rumshisky, Anna and
                 Gupta, Rahul},
    editor    = {He, Yulan and
                 Ji, Heng and
                 Li, Sujian and
                 Liu, Yang and
                 Chang, Chua-Hui},
    booktitle = {Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
    month     = nov,
    year      = {2022},
    address   = {Online only},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2022.aacl-main.1},
    doi       = {10.18653/v1/2022.aacl-main.1},
    pages     = {1--11},
    abstract  = {Natural language understanding (NLU) tasks are typically defined by creating an annotated dataset in which each utterance is encountered once. Such data does not resemble real-world natural language interactions in which certain utterances are encountered frequently, others rarely. For deployed NLU systems this is a vital problem, since the underlying machine learning (ML) models are often fine-tuned on typical NLU data, and then applied to real-world data with a very different distribution. Such systems need to maintain interpretation consistency for both high-frequency utterances and low-frequency utterances. We propose an alternative strategy that explicitly uses utterance frequency in training data to learn models that are more robust to unknown distributions. We present a methodology to simulate utterance usage in two public NLU corpora and create new corpora with head, body and tail segments. We evaluate several methods for joint intent classification and named entity recognition (IC-NER), and use two domain generalization approaches that we adapt to NER. The proposed approaches demonstrate up to 7.02{\%} relative improvement in semantic accuracy over baselines on the tail data. We provide insights as to why the proposed approaches work and show that the reasons for observed improvements do not align with those reported in previous work.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!--
  MODS v3 record for the conference paper with citekey kumar-etal-2022-chasing.
  Structure: paper title, three authors, issue date, the host proceedings volume
  (title, five editors, publisher, place), abstract, identifiers, URL and page extent.
  NOTE(review): the last editor's given name is recorded as "Chua-Hui"; this may be a
  misspelling of "Chia-Hui". Confirm against the proceedings front matter before changing it.
-->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kumar-etal-2022-chasing">
    <titleInfo>
      <title>Chasing the Tail with Domain Generalization: A Case Study on Frequency-Enriched Datasets</title>
    </titleInfo>
    <!-- Authors of the paper, in byline order. -->
    <name type="personal">
      <namePart type="given">Manoj</namePart>
      <namePart type="family">Kumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anna</namePart>
      <namePart type="family">Rumshisky</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rahul</namePart>
      <namePart type="family">Gupta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume in which the paper appears. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</title>
      </titleInfo>
      <!-- Editors of the proceedings volume. -->
      <name type="personal">
        <namePart type="given">Yulan</namePart>
        <namePart type="family">He</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Heng</namePart>
        <namePart type="family">Ji</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sujian</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Chua-Hui</namePart>
        <namePart type="family">Chang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online only</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Natural language understanding (NLU) tasks are typically defined by creating an annotated dataset in which each utterance is encountered once. Such data does not resemble real-world natural language interactions in which certain utterances are encountered frequently, others rarely. For deployed NLU systems this is a vital problem, since the underlying machine learning (ML) models are often fine-tuned on typical NLU data, and then applied to real-world data with a very different distribution. Such systems need to maintain interpretation consistency for both high-frequency utterances and low-frequency utterances. We propose an alternative strategy that explicitly uses utterance frequency in training data to learn models that are more robust to unknown distributions. We present a methodology to simulate utterance usage in two public NLU corpora and create new corpora with head, body and tail segments. We evaluate several methods for joint intent classification and named entity recognition (IC-NER), and use two domain generalization approaches that we adapt to NER. The proposed approaches demonstrate up to 7.02% relative improvement in semantic accuracy over baselines on the tail data. We provide insights as to why the proposed approaches work and show that the reasons for observed improvements do not align with those reported in previous work.</abstract>
    <identifier type="citekey">kumar-etal-2022-chasing</identifier>
    <identifier type="doi">10.18653/v1/2022.aacl-main.1</identifier>
    <location>
      <url>https://aclanthology.org/2022.aacl-main.1</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2022-11</date>
      <extent unit="page">
        <start>1</start>
        <end>11</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Chasing the Tail with Domain Generalization: A Case Study on Frequency-Enriched Datasets
%A Kumar, Manoj
%A Rumshisky, Anna
%A Gupta, Rahul
%Y He, Yulan
%Y Ji, Heng
%Y Li, Sujian
%Y Liu, Yang
%Y Chang, Chua-Hui
%S Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online only
%F kumar-etal-2022-chasing
%X Natural language understanding (NLU) tasks are typically defined by creating an annotated dataset in which each utterance is encountered once. Such data does not resemble real-world natural language interactions in which certain utterances are encountered frequently, others rarely. For deployed NLU systems this is a vital problem, since the underlying machine learning (ML) models are often fine-tuned on typical NLU data, and then applied to real-world data with a very different distribution. Such systems need to maintain interpretation consistency for both high-frequency utterances and low-frequency utterances. We propose an alternative strategy that explicitly uses utterance frequency in training data to learn models that are more robust to unknown distributions. We present a methodology to simulate utterance usage in two public NLU corpora and create new corpora with head, body and tail segments. We evaluate several methods for joint intent classification and named entity recognition (IC-NER), and use two domain generalization approaches that we adapt to NER. The proposed approaches demonstrate up to 7.02% relative improvement in semantic accuracy over baselines on the tail data. We provide insights as to why the proposed approaches work and show that the reasons for observed improvements do not align with those reported in previous work.
%R 10.18653/v1/2022.aacl-main.1
%U https://aclanthology.org/2022.aacl-main.1
%U https://doi.org/10.18653/v1/2022.aacl-main.1
%P 1-11
Markdown (Informal)
[Chasing the Tail with Domain Generalization: A Case Study on Frequency-Enriched Datasets](https://aclanthology.org/2022.aacl-main.1) (Kumar et al., AACL-IJCNLP 2022)
ACL