@inproceedings{marwah-beel-2020-term,
title = "Term-Recency for {TF}-{IDF}, {BM}25 and {USE} Term Weighting",
author = "Marwah, Divyanshu and
Beel, Joeran",
editor = "Knoth, Petr and
Stahl, Christopher and
Gyawali, Bikash and
Pride, David and
Kunnath, Suchetha N. and
Herrmannova, Drahomira",
booktitle = "Proceedings of the 8th International Workshop on Mining Scientific Publications",
month = "05 " # aug,
year = "2020",
address = "Wuhan, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wosp-1.5/",
pages = "36--41",
abstract = "Effectiveness of a recommendation in an Information Retrieval (IR) system is determined by relevancy scores of retrieved results. Term weighting is responsible for computing the relevance scores and consequently differentiating between the terms in a document. However, the current term weighting formula (TF-IDF, for instance), weighs terms only based on term frequency and inverse document frequency irrespective of other important factors. This results in ambiguity in cases when both TF and IDF values the same for more than one document, hence resulting in same TF-IDF values. In this paper, we propose a modification of TF-IDF and other term-weighting schemes that weighs the terms based on the recency and the usage in the corpus. We have tested the performance of our algorithm with existing term weighting schemes; TF-IDF, BM25 and USE text embedding model. We have indexed three different datasets with different domains to validate the premises for our algorithm. On evaluating the algorithms using Precision, Recall, F1 score, and NDCG, we found that time normalized TF-IDF outperformed the classic TF-IDF with a significant difference in all the metrics and datasets. Time-based USE model performed better than the standard USE model in two out of three datasets. But the time-based BM25 model did not perform well in some of the input queries as compared to standard BM25 model."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="marwah-beel-2020-term">
<titleInfo>
<title>Term-Recency for TF-IDF, BM25 and USE Term Weighting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Divyanshu</namePart>
<namePart type="family">Marwah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joeran</namePart>
<namePart type="family">Beel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-08-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th International Workshop on Mining Scientific Publications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Petr</namePart>
<namePart type="family">Knoth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Stahl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bikash</namePart>
<namePart type="family">Gyawali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Pride</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suchetha</namePart>
<namePart type="given">N</namePart>
<namePart type="family">Kunnath</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Drahomira</namePart>
<namePart type="family">Herrmannova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Wuhan, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Effectiveness of a recommendation in an Information Retrieval (IR) system is determined by relevancy scores of retrieved results. Term weighting is responsible for computing the relevance scores and consequently differentiating between the terms in a document. However, the current term weighting formula (TF-IDF, for instance), weighs terms only based on term frequency and inverse document frequency irrespective of other important factors. This results in ambiguity in cases when both TF and IDF values the same for more than one document, hence resulting in same TF-IDF values. In this paper, we propose a modification of TF-IDF and other term-weighting schemes that weighs the terms based on the recency and the usage in the corpus. We have tested the performance of our algorithm with existing term weighting schemes; TF-IDF, BM25 and USE text embedding model. We have indexed three different datasets with different domains to validate the premises for our algorithm. On evaluating the algorithms using Precision, Recall, F1 score, and NDCG, we found that time normalized TF-IDF outperformed the classic TF-IDF with a significant difference in all the metrics and datasets. Time-based USE model performed better than the standard USE model in two out of three datasets. But the time-based BM25 model did not perform well in some of the input queries as compared to standard BM25 model.</abstract>
<identifier type="citekey">marwah-beel-2020-term</identifier>
<location>
<url>https://aclanthology.org/2020.wosp-1.5/</url>
</location>
<part>
<date>2020-08-05</date>
<extent unit="page">
<start>36</start>
<end>41</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Term-Recency for TF-IDF, BM25 and USE Term Weighting
%A Marwah, Divyanshu
%A Beel, Joeran
%Y Knoth, Petr
%Y Stahl, Christopher
%Y Gyawali, Bikash
%Y Pride, David
%Y Kunnath, Suchetha N.
%Y Herrmannova, Drahomira
%S Proceedings of the 8th International Workshop on Mining Scientific Publications
%D 2020
%8 05 aug
%I Association for Computational Linguistics
%C Wuhan, China
%F marwah-beel-2020-term
%X Effectiveness of a recommendation in an Information Retrieval (IR) system is determined by relevancy scores of retrieved results. Term weighting is responsible for computing the relevance scores and consequently differentiating between the terms in a document. However, the current term weighting formula (TF-IDF, for instance), weighs terms only based on term frequency and inverse document frequency irrespective of other important factors. This results in ambiguity in cases when both TF and IDF values the same for more than one document, hence resulting in same TF-IDF values. In this paper, we propose a modification of TF-IDF and other term-weighting schemes that weighs the terms based on the recency and the usage in the corpus. We have tested the performance of our algorithm with existing term weighting schemes; TF-IDF, BM25 and USE text embedding model. We have indexed three different datasets with different domains to validate the premises for our algorithm. On evaluating the algorithms using Precision, Recall, F1 score, and NDCG, we found that time normalized TF-IDF outperformed the classic TF-IDF with a significant difference in all the metrics and datasets. Time-based USE model performed better than the standard USE model in two out of three datasets. But the time-based BM25 model did not perform well in some of the input queries as compared to standard BM25 model.
%U https://aclanthology.org/2020.wosp-1.5/
%P 36-41
Markdown (Informal)
[Term-Recency for TF-IDF, BM25 and USE Term Weighting](https://aclanthology.org/2020.wosp-1.5/) (Marwah & Beel, WOSP 2020)
ACL