% Conference paper entry; the url and doi fields point to the aclanthology.org record.
% Title casing: the title opens with a brace-wrapped quote macro, which BibTeX counts
% as the first character, so sentence-casing styles would lowercase the word after it.
% That word and the proper nouns are therefore brace-protected as whole words.
@inproceedings{beersmans-etal-2024-gotta,
  title     = {{\textquotedblleft}{Gotta} catch {\textquoteleft}em all!{\textquotedblright}: Retrieving people in {Ancient} {Greek} texts combining transformer models and domain knowledge},
  author    = {Beersmans, Marijke and
               Keersmaekers, Alek and
               de Graaf, Evelien and
               Van de Cruys, Tim and
               Depauw, Mark and
               Fantoli, Margherita},
  editor    = {Pavlopoulos, John and
               Sommerschield, Thea and
               Assael, Yannis and
               Gordin, Shai and
               Cho, Kyunghyun and
               Passarotti, Marco and
               Sprugnoli, Rachele and
               Liu, Yudong and
               Li, Bin and
               Anderson, Adam},
  booktitle = {Proceedings of the 1st Workshop on Machine Learning for Ancient Languages (ML4AL 2024)},
  month     = aug,
  year      = {2024},
  address   = {Hybrid in Bangkok, Thailand and online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.ml4al-1.16/},
  doi       = {10.18653/v1/2024.ml4al-1.16},
  pages     = {152--164},
  abstract  = {In this paper, we present a study of transformer-based Named Entity Recognition (NER) as applied to Ancient Greek texts, with an emphasis on retrieving personal names. Recent research shows that, while the task remains difficult, the use of transformer models results in significant improvements. We, therefore, compare the performance of four transformer models on the task of NER for the categories of people, locations and groups, and add an out-of-domain test set to the existing datasets. Results on this set highlight the shortcomings of the models when confronted with a random sample of sentences. To be able to more straightforwardly integrate domain and linguistic knowledge to improve performance, we narrow down our approach to the category of people. The task is simplified to a binary PERS/MISC classification on the token level, starting from capitalised words. Next, we test the use of domain and linguistic knowledge to improve the results. We find that including simple gazetteer information as a binary mask has a marginally positive effect on newly annotated data and that treebanks can be used to help identify multi-word individuals if they are scarcely or inconsistently annotated in the available training data. The qualitative error analysis identifies the potential for improvement in both manual annotation and the inclusion of domain and linguistic knowledge in the transformer models.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID beersmans-etal-2024-gotta. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="beersmans-etal-2024-gotta">

    <!-- Title of the paper. -->
    <titleInfo>
      <title>“Gotta catch ‘em all!”: Retrieving people in Ancient Greek texts combining transformer models and domain knowledge</title>
    </titleInfo>

    <!-- Authors: one <name> element per person, each with the "author" role. -->
    <name type="personal">
      <namePart type="given">Marijke</namePart>
      <namePart type="family">Beersmans</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alek</namePart>
      <namePart type="family">Keersmaekers</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Evelien</namePart>
      <namePart type="family">de Graaf</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tim</namePart>
      <namePart type="family">Van de Cruys</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mark</namePart>
      <namePart type="family">Depauw</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Margherita</namePart>
      <namePart type="family">Fantoli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year-month). -->
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Machine Learning for Ancient Languages (ML4AL 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">John</namePart>
        <namePart type="family">Pavlopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Thea</namePart>
        <namePart type="family">Sommerschield</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yannis</namePart>
        <namePart type="family">Assael</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shai</namePart>
        <namePart type="family">Gordin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kyunghyun</namePart>
        <namePart type="family">Cho</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marco</namePart>
        <namePart type="family">Passarotti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rachele</namePart>
        <namePart type="family">Sprugnoli</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yudong</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bin</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Adam</namePart>
        <namePart type="family">Anderson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Hybrid in Bangkok, Thailand and online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <!-- Abstract of the paper. -->
    <abstract>In this paper, we present a study of transformer-based Named Entity Recognition (NER) as applied to Ancient Greek texts, with an emphasis on retrieving personal names. Recent research shows that, while the task remains difficult, the use of transformer models results in significant improvements. We, therefore, compare the performance of four transformer models on the task of NER for the categories of people, locations and groups, and add an out-of-domain test set to the existing datasets. Results on this set highlight the shortcomings of the models when confronted with a random sample of sentences. To be able to more straightforwardly integrate domain and linguistic knowledge to improve performance, we narrow down our approach to the category of people. The task is simplified to a binary PERS/MISC classification on the token level, starting from capitalised words. Next, we test the use of domain and linguistic knowledge to improve the results. We find that including simple gazetteer information as a binary mask has a marginally positive effect on newly annotated data and that treebanks can be used to help identify multi-word individuals if they are scarcely or inconsistently annotated in the available training data. The qualitative error analysis identifies the potential for improvement in both manual annotation and the inclusion of domain and linguistic knowledge in the transformer models.</abstract>

    <!-- Identifiers and location: citation key, DOI and URL. -->
    <identifier type="citekey">beersmans-etal-2024-gotta</identifier>
    <identifier type="doi">10.18653/v1/2024.ml4al-1.16</identifier>
    <location>
      <url>https://aclanthology.org/2024.ml4al-1.16/</url>
    </location>

    <!-- Position within the host item: date and page range. -->
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>152</start>
        <end>164</end>
      </extent>
    </part>

  </mods>
</modsCollection>
%0 Conference Proceedings
%T “Gotta catch ‘em all!”: Retrieving people in Ancient Greek texts combining transformer models and domain knowledge
%A Beersmans, Marijke
%A Keersmaekers, Alek
%A de Graaf, Evelien
%A Van de Cruys, Tim
%A Depauw, Mark
%A Fantoli, Margherita
%Y Pavlopoulos, John
%Y Sommerschield, Thea
%Y Assael, Yannis
%Y Gordin, Shai
%Y Cho, Kyunghyun
%Y Passarotti, Marco
%Y Sprugnoli, Rachele
%Y Liu, Yudong
%Y Li, Bin
%Y Anderson, Adam
%S Proceedings of the 1st Workshop on Machine Learning for Ancient Languages (ML4AL 2024)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Hybrid in Bangkok, Thailand and online
%F beersmans-etal-2024-gotta
%X In this paper, we present a study of transformer-based Named Entity Recognition (NER) as applied to Ancient Greek texts, with an emphasis on retrieving personal names. Recent research shows that, while the task remains difficult, the use of transformer models results in significant improvements. We, therefore, compare the performance of four transformer models on the task of NER for the categories of people, locations and groups, and add an out-of-domain test set to the existing datasets. Results on this set highlight the shortcomings of the models when confronted with a random sample of sentences. To be able to more straightforwardly integrate domain and linguistic knowledge to improve performance, we narrow down our approach to the category of people. The task is simplified to a binary PERS/MISC classification on the token level, starting from capitalised words. Next, we test the use of domain and linguistic knowledge to improve the results. We find that including simple gazetteer information as a binary mask has a marginally positive effect on newly annotated data and that treebanks can be used to help identify multi-word individuals if they are scarcely or inconsistently annotated in the available training data. The qualitative error analysis identifies the potential for improvement in both manual annotation and the inclusion of domain and linguistic knowledge in the transformer models.
%R 10.18653/v1/2024.ml4al-1.16
%U https://aclanthology.org/2024.ml4al-1.16/
%U https://doi.org/10.18653/v1/2024.ml4al-1.16
%P 152-164
Markdown (Informal)
[“Gotta catch ‘em all!”: Retrieving people in Ancient Greek texts combining transformer models and domain knowledge](https://aclanthology.org/2024.ml4al-1.16/) (Beersmans et al., ML4AL 2024)
ACL