@inproceedings{otegi-etal-2020-automatic,
title = "Automatic Evaluation vs. User Preference in Neural Textual {Q}uestion{A}nswering over {COVID}-19 Scientific Literature",
author = "Otegi, Arantxa and
Campos, Jon Ander and
Azkune, Gorka and
Soroa, Aitor and
Agirre, Eneko",
editor = "Verspoor, Karin and
Cohen, Kevin Bretonnel and
Conway, Michael and
de Bruijn, Berry and
Dredze, Mark and
Mihalcea, Rada and
Wallace, Byron",
booktitle = "Proceedings of the 1st Workshop on {NLP} for {COVID}-19 (Part 2) at {EMNLP} 2020",
month = dec,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.nlpcovid19-2.15",
doi = "10.18653/v1/2020.nlpcovid19-2.15",
abstract = "We present a Question Answering (QA) system that won one of the tasks of the Kaggle CORD-19 Challenge, according to the qualitative evaluation of experts. The system is a combination of an Information Retrieval module and a reading comprehension module that finds the answers in the retrieved passages. In this paper we present a quantitative and qualitative analysis of the system. The quantitative evaluation using manually annotated datasets contradicted some of our design choices, e.g. the fact that using QuAC for fine-tuning provided better answers over just using SQuAD. We analyzed this mismatch with an additional A/B test which showed that the system using QuAC was indeed preferred by users, confirming our intuition. Our analysis puts in question the suitability of automatic metrics and its correlation to user preferences. We also show that automatic metrics are highly dependent on the characteristics of the gold standard, such as the average length of the answers.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="otegi-etal-2020-automatic">
<titleInfo>
<title>Automatic Evaluation vs. User Preference in Neural Textual Question Answering over COVID-19 Scientific Literature</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arantxa</namePart>
<namePart type="family">Otegi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jon</namePart>
<namePart type="given">Ander</namePart>
<namePart type="family">Campos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gorka</namePart>
<namePart type="family">Azkune</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aitor</namePart>
<namePart type="family">Soroa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eneko</namePart>
<namePart type="family">Agirre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on NLP for COVID-19 (Part 2) at EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Karin</namePart>
<namePart type="family">Verspoor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="given">Bretonnel</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Conway</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Berry</namePart>
<namePart type="family">de Bruijn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Dredze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rada</namePart>
<namePart type="family">Mihalcea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Byron</namePart>
<namePart type="family">Wallace</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a Question Answering (QA) system that won one of the tasks of the Kaggle CORD-19 Challenge, according to the qualitative evaluation of experts. The system is a combination of an Information Retrieval module and a reading comprehension module that finds the answers in the retrieved passages. In this paper we present a quantitative and qualitative analysis of the system. The quantitative evaluation using manually annotated datasets contradicted some of our design choices, e.g. the fact that using QuAC for fine-tuning provided better answers than just using SQuAD. We analyzed this mismatch with an additional A/B test, which showed that the system using QuAC was indeed preferred by users, confirming our intuition. Our analysis calls into question the suitability of automatic metrics and their correlation with user preferences. We also show that automatic metrics are highly dependent on the characteristics of the gold standard, such as the average length of the answers.</abstract>
<identifier type="citekey">otegi-etal-2020-automatic</identifier>
<identifier type="doi">10.18653/v1/2020.nlpcovid19-2.15</identifier>
<location>
<url>https://aclanthology.org/2020.nlpcovid19-2.15</url>
</location>
<part>
<date>2020-12</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Evaluation vs. User Preference in Neural Textual Question Answering over COVID-19 Scientific Literature
%A Otegi, Arantxa
%A Campos, Jon Ander
%A Azkune, Gorka
%A Soroa, Aitor
%A Agirre, Eneko
%Y Verspoor, Karin
%Y Cohen, Kevin Bretonnel
%Y Conway, Michael
%Y de Bruijn, Berry
%Y Dredze, Mark
%Y Mihalcea, Rada
%Y Wallace, Byron
%S Proceedings of the 1st Workshop on NLP for COVID-19 (Part 2) at EMNLP 2020
%D 2020
%8 December
%I Association for Computational Linguistics
%C Online
%F otegi-etal-2020-automatic
%X We present a Question Answering (QA) system that won one of the tasks of the Kaggle CORD-19 Challenge, according to the qualitative evaluation of experts. The system is a combination of an Information Retrieval module and a reading comprehension module that finds the answers in the retrieved passages. In this paper we present a quantitative and qualitative analysis of the system. The quantitative evaluation using manually annotated datasets contradicted some of our design choices, e.g. the fact that using QuAC for fine-tuning provided better answers than just using SQuAD. We analyzed this mismatch with an additional A/B test, which showed that the system using QuAC was indeed preferred by users, confirming our intuition. Our analysis calls into question the suitability of automatic metrics and their correlation with user preferences. We also show that automatic metrics are highly dependent on the characteristics of the gold standard, such as the average length of the answers.
%R 10.18653/v1/2020.nlpcovid19-2.15
%U https://aclanthology.org/2020.nlpcovid19-2.15
%U https://doi.org/10.18653/v1/2020.nlpcovid19-2.15
Markdown (Informal)
[Automatic Evaluation vs. User Preference in Neural Textual Question Answering over COVID-19 Scientific Literature](https://aclanthology.org/2020.nlpcovid19-2.15) (Otegi et al., NLP-COVID19 2020)
ACL
Arantxa Otegi, Jon Ander Campos, Gorka Azkune, Aitor Soroa, and Eneko Agirre. 2020. Automatic Evaluation vs. User Preference in Neural Textual Question Answering over COVID-19 Scientific Literature. In Proceedings of the 1st Workshop on NLP for COVID-19 (Part 2) at EMNLP 2020, Online. Association for Computational Linguistics.
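
The abstract describes a retrieve-then-read architecture: an Information Retrieval module selects candidate passages, and a reading comprehension model (fine-tuned on SQuAD or QuAC) extracts answers from them. As an informal illustration only, not the authors' implementation, the following minimal Python sketch shows that pipeline shape; the IDF-overlap retrieval and the sentence-picking "reader" are placeholder stand-ins for the neural components described in the paper.

```python
# Hypothetical sketch of a retrieve-then-read QA pipeline like the one the
# abstract describes (IR module + reading comprehension module). The scoring
# function and the "reader" are simple placeholders, not the paper's models.
import math
from collections import Counter


def retrieve(question: str, passages: list[str], k: int = 3) -> list[str]:
    """Rank passages by IDF-weighted overlap with the question tokens."""
    q_tokens = set(question.lower().split())
    n = len(passages)
    df = Counter(tok for p in passages for tok in set(p.lower().split()))

    def score(p: str) -> float:
        return sum(math.log(n / (1 + df[tok]))
                   for tok in set(p.lower().split()) & q_tokens)

    return sorted(passages, key=score, reverse=True)[:k]


def read(question: str, passage: str) -> str:
    """Placeholder "reader": return the passage sentence with the most
    question-token overlap (the paper instead uses a Transformer reader
    fine-tuned on SQuAD or QuAC)."""
    q_tokens = set(question.lower().split())
    sentences = [s.strip() for s in passage.split(".") if s.strip()]
    return max(sentences, key=lambda s: len(set(s.lower().split()) & q_tokens))


def answer(question: str, passages: list[str]) -> list[str]:
    """Retrieve the top passages, then extract one candidate answer from each."""
    return [read(question, p) for p in retrieve(question, passages)]
```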