@inproceedings{mhaske-etal-2023-naamapadam,
title = "Naamapadam: A Large-Scale Named Entity Annotated Data for {I}ndic Languages",
author = "Mhaske, Arnav and
Kedia, Harshit and
Doddapaneni, Sumanth and
Khapra, Mitesh M. and
Kumar, Pratyush and
Murthy, Rudra and
Kunchukuttan, Anoop",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.582/",
doi = "10.18653/v1/2023.acl-long.582",
pages = "10441--10456",
abstract = "We present, \textit{Naamapadam}, the largest publicly available Named Entity Recognition (NER) dataset for the 11 major Indian languages from two language families. The dataset contains more than 400k sentences annotated with a total of at least 100k entities from three standard entity categories (Person, Location, and, Organization) for 9 out of the 11 languages. The training dataset has been automatically created from the Samanantar parallel corpus by projecting automatically tagged entities from an English sentence to the corresponding Indian language translation. We also create manually annotated testsets for 9 languages. We demonstrate the utility of the obtained dataset on the Naamapadam-test dataset. We also release \textit{IndicNER}, a multilingual IndicBERT model fine-tuned on Naamapadam training set. IndicNER achieves an F1 score of more than 80 for 7 out of 9 test languages. The dataset and models are available under open-source licences at \url{https://ai4bharat.iitm.ac.in/naamapadam}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mhaske-etal-2023-naamapadam">
<titleInfo>
<title>Naamapadam: A Large-Scale Named Entity Annotated Data for Indic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arnav</namePart>
<namePart type="family">Mhaske</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harshit</namePart>
<namePart type="family">Kedia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sumanth</namePart>
<namePart type="family">Doddapaneni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mitesh</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Khapra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pratyush</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rudra</namePart>
<namePart type="family">Murthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kunchukuttan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present, Naamapadam, the largest publicly available Named Entity Recognition (NER) dataset for the 11 major Indian languages from two language families. The dataset contains more than 400k sentences annotated with a total of at least 100k entities from three standard entity categories (Person, Location, and, Organization) for 9 out of the 11 languages. The training dataset has been automatically created from the Samanantar parallel corpus by projecting automatically tagged entities from an English sentence to the corresponding Indian language translation. We also create manually annotated testsets for 9 languages. We demonstrate the utility of the obtained dataset on the Naamapadam-test dataset. We also release IndicNER, a multilingual IndicBERT model fine-tuned on Naamapadam training set. IndicNER achieves an F1 score of more than 80 for 7 out of 9 test languages. The dataset and models are available under open-source licences at https://ai4bharat.iitm.ac.in/naamapadam.</abstract>
<identifier type="citekey">mhaske-etal-2023-naamapadam</identifier>
<identifier type="doi">10.18653/v1/2023.acl-long.582</identifier>
<location>
<url>https://aclanthology.org/2023.acl-long.582/</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>10441</start>
<end>10456</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Naamapadam: A Large-Scale Named Entity Annotated Data for Indic Languages
%A Mhaske, Arnav
%A Kedia, Harshit
%A Doddapaneni, Sumanth
%A Khapra, Mitesh M.
%A Kumar, Pratyush
%A Murthy, Rudra
%A Kunchukuttan, Anoop
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F mhaske-etal-2023-naamapadam
%X We present, Naamapadam, the largest publicly available Named Entity Recognition (NER) dataset for the 11 major Indian languages from two language families. The dataset contains more than 400k sentences annotated with a total of at least 100k entities from three standard entity categories (Person, Location, and, Organization) for 9 out of the 11 languages. The training dataset has been automatically created from the Samanantar parallel corpus by projecting automatically tagged entities from an English sentence to the corresponding Indian language translation. We also create manually annotated testsets for 9 languages. We demonstrate the utility of the obtained dataset on the Naamapadam-test dataset. We also release IndicNER, a multilingual IndicBERT model fine-tuned on Naamapadam training set. IndicNER achieves an F1 score of more than 80 for 7 out of 9 test languages. The dataset and models are available under open-source licences at https://ai4bharat.iitm.ac.in/naamapadam.
%R 10.18653/v1/2023.acl-long.582
%U https://aclanthology.org/2023.acl-long.582/
%U https://doi.org/10.18653/v1/2023.acl-long.582
%P 10441-10456
Markdown (Informal)
[Naamapadam: A Large-Scale Named Entity Annotated Data for Indic Languages](https://aclanthology.org/2023.acl-long.582/) (Mhaske et al., ACL 2023)
ACL
- Arnav Mhaske, Harshit Kedia, Sumanth Doddapaneni, Mitesh M. Khapra, Pratyush Kumar, Rudra Murthy, and Anoop Kunchukuttan. 2023. Naamapadam: A Large-Scale Named Entity Annotated Data for Indic Languages. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 10441–10456, Toronto, Canada. Association for Computational Linguistics.