BibTeX
@inproceedings{fleisig-etal-2023-majority,
    title = "When the Majority is Wrong: Modeling Annotator Disagreement for Subjective Tasks",
    author = "Fleisig, Eve and
      Abebe, Rediet and
      Klein, Dan",
    editor = "Bouamor, Houda and
      Pino, Juan and
      Bali, Kalika",
    booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.emnlp-main.415",
    doi = "10.18653/v1/2023.emnlp-main.415",
    pages = "6715--6726",
    abstract = "Though majority vote among annotators is typically used for ground truth labels in machine learning, annotator disagreement in tasks such as hate speech detection may reflect systematic differences in opinion across groups, not noise. Thus, a crucial problem in hate speech detection is determining if a statement is offensive to the demographic group that it targets, when that group may be a small fraction of the annotator pool. We construct a model that predicts individual annotator ratings on potentially offensive text and combines this information with the predicted target group of the text to predict the ratings of target group members. We show gains across a range of metrics, including raising performance over the baseline by 22{\%} at predicting individual annotators{'} ratings and by 33{\%} at predicting variance among annotators, which provides a metric for model uncertainty downstream. We find that annotators{'} ratings can be predicted using their demographic information as well as opinions on online content, and that non-invasive questions on annotators{'} online experiences minimize the need to collect demographic information when predicting annotators{'} opinions.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="fleisig-etal-2023-majority">
    <titleInfo>
      <title>When the Majority is Wrong: Modeling Annotator Disagreement for Subjective Tasks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Eve</namePart>
      <namePart type="family">Fleisig</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rediet</namePart>
      <namePart type="family">Abebe</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Klein</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Though majority vote among annotators is typically used for ground truth labels in machine learning, annotator disagreement in tasks such as hate speech detection may reflect systematic differences in opinion across groups, not noise. Thus, a crucial problem in hate speech detection is determining if a statement is offensive to the demographic group that it targets, when that group may be a small fraction of the annotator pool. We construct a model that predicts individual annotator ratings on potentially offensive text and combines this information with the predicted target group of the text to predict the ratings of target group members. We show gains across a range of metrics, including raising performance over the baseline by 22% at predicting individual annotators’ ratings and by 33% at predicting variance among annotators, which provides a metric for model uncertainty downstream. We find that annotators’ ratings can be predicted using their demographic information as well as opinions on online content, and that non-invasive questions on annotators’ online experiences minimize the need to collect demographic information when predicting annotators’ opinions.</abstract>
    <identifier type="citekey">fleisig-etal-2023-majority</identifier>
    <identifier type="doi">10.18653/v1/2023.emnlp-main.415</identifier>
    <location>
      <url>https://aclanthology.org/2023.emnlp-main.415</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>6715</start>
        <end>6726</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T When the Majority is Wrong: Modeling Annotator Disagreement for Subjective Tasks
%A Fleisig, Eve
%A Abebe, Rediet
%A Klein, Dan
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F fleisig-etal-2023-majority
%X Though majority vote among annotators is typically used for ground truth labels in machine learning, annotator disagreement in tasks such as hate speech detection may reflect systematic differences in opinion across groups, not noise. Thus, a crucial problem in hate speech detection is determining if a statement is offensive to the demographic group that it targets, when that group may be a small fraction of the annotator pool. We construct a model that predicts individual annotator ratings on potentially offensive text and combines this information with the predicted target group of the text to predict the ratings of target group members. We show gains across a range of metrics, including raising performance over the baseline by 22% at predicting individual annotators’ ratings and by 33% at predicting variance among annotators, which provides a metric for model uncertainty downstream. We find that annotators’ ratings can be predicted using their demographic information as well as opinions on online content, and that non-invasive questions on annotators’ online experiences minimize the need to collect demographic information when predicting annotators’ opinions.
%R 10.18653/v1/2023.emnlp-main.415
%U https://aclanthology.org/2023.emnlp-main.415
%U https://doi.org/10.18653/v1/2023.emnlp-main.415
%P 6715-6726
Markdown (Informal)
[When the Majority is Wrong: Modeling Annotator Disagreement for Subjective Tasks](https://aclanthology.org/2023.emnlp-main.415) (Fleisig et al., EMNLP 2023)
ACL
Eve Fleisig, Rediet Abebe, and Dan Klein. 2023. [When the Majority is Wrong: Modeling Annotator Disagreement for Subjective Tasks](https://aclanthology.org/2023.emnlp-main.415). In *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, pages 6715–6726, Singapore. Association for Computational Linguistics.