@inproceedings{strobl-etal-2020-wexea,
title = "{WEXEA}: {W}ikipedia {EX}haustive Entity Annotation",
author = "Strobl, Michael and
Trabelsi, Amine and
Zaiane, Osmar",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.240",
pages = "1951--1958",
abstract = "Building predictive models for information extraction from text, such as named entity recognition or the extraction of semantic relationships between named entities in text, requires a large corpus of annotated text. Wikipedia is often used as a corpus for these tasks where the annotation is a named entity linked by a hyperlink to its article. However, editors on Wikipedia are only expected to link these mentions in order to help the reader to understand the content, but are discouraged from adding links that do not add any benefit for understanding an article. Therefore, many mentions of popular entities (such as countries or popular events in history), or previously linked articles, as well as the article{'}s entity itself, are not linked. In this paper, we discuss WEXEA, a Wikipedia EXhaustive Entity Annotation system, to create a text corpus based on Wikipedia with exhaustive annotations of entity mentions, i.e. linking all mentions of entities to their corresponding articles. This results in a huge potential for additional annotations that can be used for downstream NLP tasks, such as Relation Extraction. We show that our annotations are useful for creating distantly supervised datasets for this task. Furthermore, we publish all code necessary to derive a corpus from a raw Wikipedia dump, so that it can be reproduced by everyone.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="strobl-etal-2020-wexea">
<titleInfo>
<title>WEXEA: Wikipedia EXhaustive Entity Annotation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Strobl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amine</namePart>
<namePart type="family">Trabelsi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Osmar</namePart>
<namePart type="family">Zaiane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>Building predictive models for information extraction from text, such as named entity recognition or the extraction of semantic relationships between named entities in text, requires a large corpus of annotated text. Wikipedia is often used as a corpus for these tasks where the annotation is a named entity linked by a hyperlink to its article. However, editors on Wikipedia are only expected to link these mentions in order to help the reader to understand the content, but are discouraged from adding links that do not add any benefit for understanding an article. Therefore, many mentions of popular entities (such as countries or popular events in history), or previously linked articles, as well as the article’s entity itself, are not linked. In this paper, we discuss WEXEA, a Wikipedia EXhaustive Entity Annotation system, to create a text corpus based on Wikipedia with exhaustive annotations of entity mentions, i.e. linking all mentions of entities to their corresponding articles. This results in a huge potential for additional annotations that can be used for downstream NLP tasks, such as Relation Extraction. We show that our annotations are useful for creating distantly supervised datasets for this task. Furthermore, we publish all code necessary to derive a corpus from a raw Wikipedia dump, so that it can be reproduced by everyone.</abstract>
<identifier type="citekey">strobl-etal-2020-wexea</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.240</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>1951</start>
<end>1958</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WEXEA: Wikipedia EXhaustive Entity Annotation
%A Strobl, Michael
%A Trabelsi, Amine
%A Zaiane, Osmar
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F strobl-etal-2020-wexea
%X Building predictive models for information extraction from text, such as named entity recognition or the extraction of semantic relationships between named entities in text, requires a large corpus of annotated text. Wikipedia is often used as a corpus for these tasks where the annotation is a named entity linked by a hyperlink to its article. However, editors on Wikipedia are only expected to link these mentions in order to help the reader to understand the content, but are discouraged from adding links that do not add any benefit for understanding an article. Therefore, many mentions of popular entities (such as countries or popular events in history), or previously linked articles, as well as the article’s entity itself, are not linked. In this paper, we discuss WEXEA, a Wikipedia EXhaustive Entity Annotation system, to create a text corpus based on Wikipedia with exhaustive annotations of entity mentions, i.e. linking all mentions of entities to their corresponding articles. This results in a huge potential for additional annotations that can be used for downstream NLP tasks, such as Relation Extraction. We show that our annotations are useful for creating distantly supervised datasets for this task. Furthermore, we publish all code necessary to derive a corpus from a raw Wikipedia dump, so that it can be reproduced by everyone.
%U https://aclanthology.org/2020.lrec-1.240
%P 1951-1958
Markdown (Informal)
[WEXEA: Wikipedia EXhaustive Entity Annotation](https://aclanthology.org/2020.lrec-1.240) (Strobl et al., LREC 2020)
ACL
- Michael Strobl, Amine Trabelsi, and Osmar Zaiane. 2020. WEXEA: Wikipedia EXhaustive Entity Annotation. In Proceedings of the Twelfth Language Resources and Evaluation Conference, pages 1951–1958, Marseille, France. European Language Resources Association.