% ACL Anthology record for the M-BAD workshop paper (pages 75--85 of the host proceedings).
% Text outside an @entry is ignored by BibTeX, so this note does not affect parsing.
% The acronym in the title is braced as a whole word so sentence-casing styles keep it intact.
@inproceedings{sharif-etal-2022-bad,
    title     = {{M-BAD}: A Multilabel Dataset for Detecting Aggressive Texts and Their Targets},
    author    = {Sharif, Omar and
                 Hossain, Eftekhar and
                 Hoque, Mohammed Moshiul},
    editor    = {Chakraborty, Tanmoy and
                 Akhtar, Md. Shad and
                 Shu, Kai and
                 Bernard, H. Russell and
                 Liakata, Maria and
                 Nakov, Preslav and
                 Srivastava, Aseem},
    booktitle = {Proceedings of the Workshop on Combating Online Hostile Posts in Regional Languages during Emergency Situations},
    month     = may,
    year      = {2022},
    address   = {Dublin, Ireland},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2022.constraint-1.9/},
    doi       = {10.18653/v1/2022.constraint-1.9},
    pages     = {75--85},
    abstract  = {Recently, detection and categorization of undesired (e.g., aggressive, abusive, offensive, hate) content from online platforms has grabbed the attention of researchers because of its detrimental impact on society. Several attempts have been made to mitigate the usage and propagation of such content. However, most past studies were conducted primarily for English, where low-resource languages like Bengali remained out of the focus. Therefore, to facilitate research in this arena, this paper introduces a novel multilabel Bengali dataset (named M-BAD) containing 15650 texts to detect aggressive texts and their targets. Each text of M-BAD went through rigorous two-level annotations. At the primary level, each text is labelled as either aggressive or non-aggressive. In the secondary level, the aggressive texts have been further annotated into five fine-grained target classes: religion, politics, verbal, gender and race. Baseline experiments are carried out with different machine learning (ML), deep learning (DL) and transformer models, where Bangla-BERT acquired the highest weighted $f_1$-score in both detection (0.92) and target identification (0.83) tasks. Error analysis of the models exhibits the difficulty to identify context-dependent aggression, and this work argues that further research is required to address these issues.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey sharif-etal-2022-bad.
       Top-level <name> elements are the paper's authors; the <relatedItem type="host">
       element describes the proceedings volume together with its editors and publisher. -->
  <mods ID="sharif-etal-2022-bad">
    <titleInfo>
      <title>M-BAD: A Multilabel Dataset for Detecting Aggressive Texts and Their Targets</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Omar</namePart>
      <namePart type="family">Sharif</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eftekhar</namePart>
      <namePart type="family">Hossain</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammed</namePart>
      <namePart type="given">Moshiul</namePart>
      <namePart type="family">Hoque</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Workshop on Combating Online Hostile Posts in Regional Languages during Emergency Situations</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Md.</namePart>
        <namePart type="given">Shad</namePart>
        <namePart type="family">Akhtar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kai</namePart>
        <namePart type="family">Shu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <!-- Initial keeps its period, matching "Md." above and the other exports of this record. -->
        <namePart type="given">H.</namePart>
        <namePart type="given">Russell</namePart>
        <namePart type="family">Bernard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aseem</namePart>
        <namePart type="family">Srivastava</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recently, detection and categorization of undesired (e.g., aggressive, abusive, offensive, hate) content from online platforms has grabbed the attention of researchers because of its detrimental impact on society. Several attempts have been made to mitigate the usage and propagation of such content. However, most past studies were conducted primarily for English, where low-resource languages like Bengali remained out of the focus. Therefore, to facilitate research in this arena, this paper introduces a novel multilabel Bengali dataset (named M-BAD) containing 15650 texts to detect aggressive texts and their targets. Each text of M-BAD went through rigorous two-level annotations. At the primary level, each text is labelled as either aggressive or non-aggressive. In the secondary level, the aggressive texts have been further annotated into five fine-grained target classes: religion, politics, verbal, gender and race. Baseline experiments are carried out with different machine learning (ML), deep learning (DL) and transformer models, where Bangla-BERT acquired the highest weighted f₁-score in both detection (0.92) and target identification (0.83) tasks. Error analysis of the models exhibits the difficulty to identify context-dependent aggression, and this work argues that further research is required to address these issues.</abstract>
    <identifier type="citekey">sharif-etal-2022-bad</identifier>
    <identifier type="doi">10.18653/v1/2022.constraint-1.9</identifier>
    <location>
      <url>https://aclanthology.org/2022.constraint-1.9/</url>
    </location>
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>75</start>
        <end>85</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T M-BAD: A Multilabel Dataset for Detecting Aggressive Texts and Their Targets
%A Sharif, Omar
%A Hossain, Eftekhar
%A Hoque, Mohammed Moshiul
%Y Chakraborty, Tanmoy
%Y Akhtar, Md. Shad
%Y Shu, Kai
%Y Bernard, H. Russell
%Y Liakata, Maria
%Y Nakov, Preslav
%Y Srivastava, Aseem
%S Proceedings of the Workshop on Combating Online Hostile Posts in Regional Languages during Emergency Situations
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F sharif-etal-2022-bad
%X Recently, detection and categorization of undesired (e.g., aggressive, abusive, offensive, hate) content from online platforms has grabbed the attention of researchers because of its detrimental impact on society. Several attempts have been made to mitigate the usage and propagation of such content. However, most past studies were conducted primarily for English, where low-resource languages like Bengali remained out of the focus. Therefore, to facilitate research in this arena, this paper introduces a novel multilabel Bengali dataset (named M-BAD) containing 15650 texts to detect aggressive texts and their targets. Each text of M-BAD went through rigorous two-level annotations. At the primary level, each text is labelled as either aggressive or non-aggressive. In the secondary level, the aggressive texts have been further annotated into five fine-grained target classes: religion, politics, verbal, gender and race. Baseline experiments are carried out with different machine learning (ML), deep learning (DL) and transformer models, where Bangla-BERT acquired the highest weighted f₁-score in both detection (0.92) and target identification (0.83) tasks. Error analysis of the models exhibits the difficulty to identify context-dependent aggression, and this work argues that further research is required to address these issues.
%R 10.18653/v1/2022.constraint-1.9
%U https://aclanthology.org/2022.constraint-1.9/
%U https://doi.org/10.18653/v1/2022.constraint-1.9
%P 75-85
Markdown (Informal)
[M-BAD: A Multilabel Dataset for Detecting Aggressive Texts and Their Targets](https://aclanthology.org/2022.constraint-1.9/) (Sharif et al., CONSTRAINT 2022)
ACL