@inproceedings{lin-etal-2024-combating,
title = "Combating Label Sparsity in Short Text Topic Modeling via Nearest Neighbor Augmentation",
author = "Lin, Yang and
Ma, Xinyu and
Gao, Xin and
Li, Ruiqing and
Wang, Yasha and
Chu, Xu",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.817/",
doi = "10.18653/v1/2024.findings-acl.817",
pages = "13762--13774",
abstract = "Extracting semantic topics from short texts presents a significant challenge in the field of data mining. While efforts have been made to mitigate data sparsity issue, the limited length of short documents also results in the absence of semantically relevant words, causing biased evidence lower bound and incomplete labels for likelihood maximization. We refer to this issue as the label sparsity problem. To combat this problem, we propose kNNTM, a neural short text topic model that incorporates a $k$-Nearest-Neighbor-based label completion algorithm by augmenting the reconstruction label with $k$-nearest documents to complement these relevant but unobserved words. Furthermore, seeking a precise reflection of distances between documents, we propose a fused multi-view distances metric that takes both local word similarities and global topic semantics into consideration. Extensive experiments on multiple public short-text datasets show that kNNTM model outperforms the state-of-the-art baseline models and can derive both high-quality topics and document representations."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lin-etal-2024-combating">
<titleInfo>
<title>Combating Label Sparsity in Short Text Topic Modeling via Nearest Neighbor Augmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyu</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruiqing</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yasha</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Extracting semantic topics from short texts presents a significant challenge in the field of data mining. While efforts have been made to mitigate data sparsity issue, the limited length of short documents also results in the absence of semantically relevant words, causing biased evidence lower bound and incomplete labels for likelihood maximization. We refer to this issue as the label sparsity problem. To combat this problem, we propose kNNTM, a neural short text topic model that incorporates a k-Nearest-Neighbor-based label completion algorithm by augmenting the reconstruction label with k-nearest documents to complement these relevant but unobserved words. Furthermore, seeking a precise reflection of distances between documents, we propose a fused multi-view distances metric that takes both local word similarities and global topic semantics into consideration. Extensive experiments on multiple public short-text datasets show that kNNTM model outperforms the state-of-the-art baseline models and can derive both high-quality topics and document representations.</abstract>
<identifier type="citekey">lin-etal-2024-combating</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.817</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.817/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>13762</start>
<end>13774</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Combating Label Sparsity in Short Text Topic Modeling via Nearest Neighbor Augmentation
%A Lin, Yang
%A Ma, Xinyu
%A Gao, Xin
%A Li, Ruiqing
%A Wang, Yasha
%A Chu, Xu
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F lin-etal-2024-combating
%X Extracting semantic topics from short texts presents a significant challenge in the field of data mining. While efforts have been made to mitigate data sparsity issue, the limited length of short documents also results in the absence of semantically relevant words, causing biased evidence lower bound and incomplete labels for likelihood maximization. We refer to this issue as the label sparsity problem. To combat this problem, we propose kNNTM, a neural short text topic model that incorporates a k-Nearest-Neighbor-based label completion algorithm by augmenting the reconstruction label with k-nearest documents to complement these relevant but unobserved words. Furthermore, seeking a precise reflection of distances between documents, we propose a fused multi-view distances metric that takes both local word similarities and global topic semantics into consideration. Extensive experiments on multiple public short-text datasets show that kNNTM model outperforms the state-of-the-art baseline models and can derive both high-quality topics and document representations.
%R 10.18653/v1/2024.findings-acl.817
%U https://aclanthology.org/2024.findings-acl.817/
%U https://doi.org/10.18653/v1/2024.findings-acl.817
%P 13762-13774
Markdown (Informal)
[Combating Label Sparsity in Short Text Topic Modeling via Nearest Neighbor Augmentation](https://aclanthology.org/2024.findings-acl.817/) (Lin et al., Findings 2024)
ACL