BibTeX
@article{adlakha-etal-2024-evaluating,
title = "Evaluating Correctness and Faithfulness of Instruction-Following Models for Question Answering",
author = "Adlakha, Vaibhav and
BehnamGhader, Parishad and
Lu, Xing Han and
Meade, Nicholas and
Reddy, Siva",
journal = "Transactions of the Association for Computational Linguistics",
volume = "12",
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.tacl-1.38/",
doi = "10.1162/tacl_a_00667",
pages = "681--699",
abstract = "Instruction-following models are attractive alternatives to fine-tuned approaches for question answering (QA). By simply prepending relevant documents and an instruction to their input, these models can be adapted to various information domains and tasks without additional training. However, these models tend to produce verbose responses with supplementary information, which makes traditional QA metrics like exact match (EM) and F1 unreliable for accurately quantifying model performance. In this work, we evaluate instruction-following models along two fronts: 1) how well they satisfy user`s information need (correctness), and 2) whether they disseminate information supported by the provided knowledge (faithfulness). Guided by human evaluation and analysis, we highlight the shortcomings of traditional metrics for both correctness and faithfulness and propose simple token-overlap metrics that correlate highly with human judgments. Our analysis reveals that for correctness, instruction-following models perform comparably to models specifically fine-tuned for that task. However, they struggle to accurately judge the relevance of the provided knowledge and often hallucinate in their responses. We hope our work encourages more holistic evaluation of instruction-following models for QA. Our code and human annotation data is available at https://github.com/McGill-NLP/instruct-qa."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="adlakha-etal-2024-evaluating">
<titleInfo>
<title>Evaluating Correctness and Faithfulness of Instruction-Following Models for Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vaibhav</namePart>
<namePart type="family">Adlakha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parishad</namePart>
<namePart type="family">BehnamGhader</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xing</namePart>
<namePart type="given">Han</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Meade</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siva</namePart>
<namePart type="family">Reddy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Instruction-following models are attractive alternatives to fine-tuned approaches for question answering (QA). By simply prepending relevant documents and an instruction to their input, these models can be adapted to various information domains and tasks without additional training. However, these models tend to produce verbose responses with supplementary information, which makes traditional QA metrics like exact match (EM) and F1 unreliable for accurately quantifying model performance. In this work, we evaluate instruction-following models along two fronts: 1) how well they satisfy user‘s information need (correctness), and 2) whether they disseminate information supported by the provided knowledge (faithfulness). Guided by human evaluation and analysis, we highlight the shortcomings of traditional metrics for both correctness and faithfulness and propose simple token-overlap metrics that correlate highly with human judgments. Our analysis reveals that for correctness, instruction-following models perform comparably to models specifically fine-tuned for that task. However, they struggle to accurately judge the relevance of the provided knowledge and often hallucinate in their responses. We hope our work encourages more holistic evaluation of instruction-following models for QA. Our code and human annotation data is available at https://github.com/McGill-NLP/instruct-qa.</abstract>
<identifier type="citekey">adlakha-etal-2024-evaluating</identifier>
<identifier type="doi">10.1162/tacl_a_00667</identifier>
<location>
<url>https://aclanthology.org/2024.tacl-1.38/</url>
</location>
<part>
<date>2024</date>
<detail type="volume"><number>12</number></detail>
<extent unit="page">
<start>681</start>
<end>699</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Journal Article
%T Evaluating Correctness and Faithfulness of Instruction-Following Models for Question Answering
%A Adlakha, Vaibhav
%A BehnamGhader, Parishad
%A Lu, Xing Han
%A Meade, Nicholas
%A Reddy, Siva
%J Transactions of the Association for Computational Linguistics
%D 2024
%V 12
%I MIT Press
%C Cambridge, MA
%F adlakha-etal-2024-evaluating
%X Instruction-following models are attractive alternatives to fine-tuned approaches for question answering (QA). By simply prepending relevant documents and an instruction to their input, these models can be adapted to various information domains and tasks without additional training. However, these models tend to produce verbose responses with supplementary information, which makes traditional QA metrics like exact match (EM) and F1 unreliable for accurately quantifying model performance. In this work, we evaluate instruction-following models along two fronts: 1) how well they satisfy user's information need (correctness), and 2) whether they disseminate information supported by the provided knowledge (faithfulness). Guided by human evaluation and analysis, we highlight the shortcomings of traditional metrics for both correctness and faithfulness and propose simple token-overlap metrics that correlate highly with human judgments. Our analysis reveals that for correctness, instruction-following models perform comparably to models specifically fine-tuned for that task. However, they struggle to accurately judge the relevance of the provided knowledge and often hallucinate in their responses. We hope our work encourages more holistic evaluation of instruction-following models for QA. Our code and human annotation data is available at https://github.com/McGill-NLP/instruct-qa.
%R 10.1162/tacl_a_00667
%U https://aclanthology.org/2024.tacl-1.38/
%U https://doi.org/10.1162/tacl_a_00667
%P 681-699
Markdown (Informal)
[Evaluating Correctness and Faithfulness of Instruction-Following Models for Question Answering](https://aclanthology.org/2024.tacl-1.38/) (Adlakha et al., TACL 2024)
ACL
Vaibhav Adlakha, Parishad BehnamGhader, Xing Han Lu, Nicholas Meade, and Siva Reddy. 2024. Evaluating Correctness and Faithfulness of Instruction-Following Models for Question Answering. Transactions of the Association for Computational Linguistics, 12:681–699.
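
The abstract above refers to "simple token-overlap metrics" as more reliable alternatives to exact match and F1 for the verbose responses of instruction-following models. As a rough, illustrative sketch only (not the authors' exact metric definitions; the official implementation is in the linked instruct-qa repository), a token-level recall score over a normalized reference answer can be written as:

# Illustrative sketch of a token-overlap recall metric: the fraction of
# reference-answer tokens that appear in the model response. Function names
# and normalization choices here are assumptions for illustration; see
# https://github.com/McGill-NLP/instruct-qa for the paper's actual code.
import re
import string


def normalize(text: str) -> str:
    """Lowercase, strip punctuation and articles, collapse whitespace."""
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())


def token_recall(response: str, reference: str) -> float:
    """Return the fraction of reference tokens found in the response."""
    ref_tokens = normalize(reference).split()
    resp_tokens = set(normalize(response).split())
    if not ref_tokens:
        return 0.0
    return sum(tok in resp_tokens for tok in ref_tokens) / len(ref_tokens)


# A verbose but correct answer scores 1.0 here, whereas exact match would be 0.
print(token_recall("The capital of France is Paris, on the Seine.", "Paris"))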