@inproceedings{canizares-diaz-etal-2021-active,
title = "Active Learning for Assisted Corpus Construction: A Case Study in Knowledge Discovery from Biomedical Text",
author = "Ca{\~n}izares-D{\'\i}az, Hian and
Piad-Morffis, Alejandro and
Estevez-Velarde, Suilan and
Guti{\'e}rrez, Yoan and
Almeida Cruz, Yudivi{\'a}n and
Montoyo, Andres and
Mu{\~n}oz-Guillena, Rafael",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)",
month = sep,
year = "2021",
address = "Held Online",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/2021.ranlp-1.26",
pages = "216--225",
abstract = "This paper presents an active learning approach that aims to reduce the human effort required during the annotation of natural language corpora composed of entities and semantic relations. Our approach assists human annotators by intelligently selecting the most informative sentences to annotate and then pre-annotating them with a few highly accurate entities and semantic relations. We define an uncertainty-based query strategy with a weighted density factor, using similarity metrics based on sentence embeddings. As a case study, we evaluate our approach via simulation in a biomedical corpus and estimate the potential reduction in total annotation time. Experimental results suggest that the query strategy reduces by between 35{\%} and 40{\%} the number of sentences that must be manually annotated to develop systems able to reach a target F1 score, while the pre-annotation strategy produces an additional 24{\%} reduction in the total annotation time. Overall, our preliminary experiments suggest that as much as 60{\%} of the annotation time could be saved while producing corpora that have the same usefulness for training machine learning algorithms. An open-source computational tool that implements the aforementioned strategies is presented and published online for the research community.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="canizares-diaz-etal-2021-active">
<titleInfo>
<title>Active Learning for Assisted Corpus Construction: A Case Study in Knowledge Discovery from Biomedical Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hian</namePart>
<namePart type="family">Cañizares-Díaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alejandro</namePart>
<namePart type="family">Piad-Morffis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suilan</namePart>
<namePart type="family">Estevez-Velarde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoan</namePart>
<namePart type="family">Gutiérrez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yudivián</namePart>
<namePart type="family">Almeida Cruz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andres</namePart>
<namePart type="family">Montoyo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rafael</namePart>
<namePart type="family">Muñoz-Guillena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Held Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents an active learning approach that aims to reduce the human effort required during the annotation of natural language corpora composed of entities and semantic relations. Our approach assists human annotators by intelligently selecting the most informative sentences to annotate and then pre-annotating them with a few highly accurate entities and semantic relations. We define an uncertainty-based query strategy with a weighted density factor, using similarity metrics based on sentence embeddings. As a case study, we evaluate our approach via simulation in a biomedical corpus and estimate the potential reduction in total annotation time. Experimental results suggest that the query strategy reduces by between 35% and 40% the number of sentences that must be manually annotated to develop systems able to reach a target F1 score, while the pre-annotation strategy produces an additional 24% reduction in the total annotation time. Overall, our preliminary experiments suggest that as much as 60% of the annotation time could be saved while producing corpora that have the same usefulness for training machine learning algorithms. An open-source computational tool that implements the aforementioned strategies is presented and published online for the research community.</abstract>
<identifier type="citekey">canizares-diaz-etal-2021-active</identifier>
<location>
<url>https://aclanthology.org/2021.ranlp-1.26</url>
</location>
<part>
<date>2021-09</date>
<extent unit="page">
<start>216</start>
<end>225</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Active Learning for Assisted Corpus Construction: A Case Study in Knowledge Discovery from Biomedical Text
%A Cañizares-Díaz, Hian
%A Piad-Morffis, Alejandro
%A Estevez-Velarde, Suilan
%A Gutiérrez, Yoan
%A Almeida Cruz, Yudivián
%A Montoyo, Andres
%A Muñoz-Guillena, Rafael
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Held Online
%F canizares-diaz-etal-2021-active
%X This paper presents an active learning approach that aims to reduce the human effort required during the annotation of natural language corpora composed of entities and semantic relations. Our approach assists human annotators by intelligently selecting the most informative sentences to annotate and then pre-annotating them with a few highly accurate entities and semantic relations. We define an uncertainty-based query strategy with a weighted density factor, using similarity metrics based on sentence embeddings. As a case study, we evaluate our approach via simulation in a biomedical corpus and estimate the potential reduction in total annotation time. Experimental results suggest that the query strategy reduces by between 35% and 40% the number of sentences that must be manually annotated to develop systems able to reach a target F1 score, while the pre-annotation strategy produces an additional 24% reduction in the total annotation time. Overall, our preliminary experiments suggest that as much as 60% of the annotation time could be saved while producing corpora that have the same usefulness for training machine learning algorithms. An open-source computational tool that implements the aforementioned strategies is presented and published online for the research community.
%U https://aclanthology.org/2021.ranlp-1.26
%P 216-225
Markdown (Informal)
[Active Learning for Assisted Corpus Construction: A Case Study in Knowledge Discovery from Biomedical Text](https://aclanthology.org/2021.ranlp-1.26) (Cañizares-Díaz et al., RANLP 2021)
ACL