@inproceedings{tian-etal-2021-bahp,
title = "{BAHP}: Benchmark of Assessing Word Embeddings in Historical {P}ortuguese",
author = "Tian, Zuoyu and
Jarrett, Dylan and
Escalona Torres, Juan and
Amaral, Patricia",
editor = "Degaetano-Ortlieb, Stefania and
Kazantseva, Anna and
Reiter, Nils and
Szpakowicz, Stan",
booktitle = "Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic (online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.latechclfl-1.13",
doi = "10.18653/v1/2021.latechclfl-1.13",
pages = "113--119",
abstract = "High quality distributional models can capture lexical and semantic relations between words. Hence, researchers design various intrinsic tasks to test whether such relations are captured. However, most of the intrinsic tasks are designed for modern languages, and there is a lack of evaluation methods for distributional models of historical corpora. In this paper, we conducted BAHP: a benchmark of assessing word embeddings in Historical Portuguese, which contains four types of tests: analogy, similarity, outlier detection, and coherence. We examined word2vec models generated from two historical Portuguese corpora in these four test sets. The results demonstrate that our test sets are capable of measuring the quality of vector space models and can provide a holistic view of the model{'}s ability to capture syntactic and semantic information. Furthermore, the methodology for the creation of our test sets can be easily extended to other historical languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tian-etal-2021-bahp">
<titleInfo>
<title>BAHP: Benchmark of Assessing Word Embeddings in Historical Portuguese</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zuoyu</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dylan</namePart>
<namePart type="family">Jarrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Escalona Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patricia</namePart>
<namePart type="family">Amaral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">Degaetano-Ortlieb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nils</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic (online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>High quality distributional models can capture lexical and semantic relations between words. Hence, researchers design various intrinsic tasks to test whether such relations are captured. However, most of the intrinsic tasks are designed for modern languages, and there is a lack of evaluation methods for distributional models of historical corpora. In this paper, we conducted BAHP: a benchmark of assessing word embeddings in Historical Portuguese, which contains four types of tests: analogy, similarity, outlier detection, and coherence. We examined word2vec models generated from two historical Portuguese corpora in these four test sets. The results demonstrate that our test sets are capable of measuring the quality of vector space models and can provide a holistic view of the model’s ability to capture syntactic and semantic information. Furthermore, the methodology for the creation of our test sets can be easily extended to other historical languages.</abstract>
<identifier type="citekey">tian-etal-2021-bahp</identifier>
<identifier type="doi">10.18653/v1/2021.latechclfl-1.13</identifier>
<location>
<url>https://aclanthology.org/2021.latechclfl-1.13</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>113</start>
<end>119</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BAHP: Benchmark of Assessing Word Embeddings in Historical Portuguese
%A Tian, Zuoyu
%A Jarrett, Dylan
%A Escalona Torres, Juan
%A Amaral, Patricia
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Reiter, Nils
%Y Szpakowicz, Stan
%S Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic (online)
%F tian-etal-2021-bahp
%X High quality distributional models can capture lexical and semantic relations between words. Hence, researchers design various intrinsic tasks to test whether such relations are captured. However, most of the intrinsic tasks are designed for modern languages, and there is a lack of evaluation methods for distributional models of historical corpora. In this paper, we conducted BAHP: a benchmark of assessing word embeddings in Historical Portuguese, which contains four types of tests: analogy, similarity, outlier detection, and coherence. We examined word2vec models generated from two historical Portuguese corpora in these four test sets. The results demonstrate that our test sets are capable of measuring the quality of vector space models and can provide a holistic view of the model’s ability to capture syntactic and semantic information. Furthermore, the methodology for the creation of our test sets can be easily extended to other historical languages.
%R 10.18653/v1/2021.latechclfl-1.13
%U https://aclanthology.org/2021.latechclfl-1.13
%U https://doi.org/10.18653/v1/2021.latechclfl-1.13
%P 113-119
Markdown (Informal)
[BAHP: Benchmark of Assessing Word Embeddings in Historical Portuguese](https://aclanthology.org/2021.latechclfl-1.13) (Tian et al., LaTeCHCLfL 2021)
ACL
- Zuoyu Tian, Dylan Jarrett, Juan Escalona Torres, and Patricia Amaral. 2021. BAHP: Benchmark of Assessing Word Embeddings in Historical Portuguese. In Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pages 113–119, Punta Cana, Dominican Republic (online). Association for Computational Linguistics.