@inproceedings{klubicka-etal-2019-synthetic,
title = "Synthetic, yet natural: Properties of {W}ord{N}et random walk corpora and the impact of rare words on embedding performance",
author = "Klubi{\v{c}}ka, Filip and
Maldonado, Alfredo and
Mahalunkar, Abhijit and
Kelleher, John",
editor = "Vossen, Piek and
Fellbaum, Christiane",
booktitle = "Proceedings of the 10th Global Wordnet Conference",
month = jul,
year = "2019",
address = "Wroclaw, Poland",
publisher = "Global Wordnet Association",
url = "https://aclanthology.org/2019.gwc-1.18",
pages = "140--150",
abstract = "Creating word embeddings that reflect semantic relationships encoded in lexical knowledge resources is an open challenge. One approach is to use a random walk over a knowledge graph to generate a pseudo-corpus and use this corpus to train embeddings. However, the effect of the shape of the knowledge graph on the generated pseudo-corpora, and on the resulting word embeddings, has not been studied. To explore this, we use English WordNet, constrained to the taxonomic (tree-like) portion of the graph, as a case study. We investigate the properties of the generated pseudo-corpora, and their impact on the resulting embeddings. We find that the distributions in the pseudo-corpora exhibit properties found in natural corpora, such as Zipf{'}s and Heaps{'} law, and also observe that the proportion of rare words in a pseudo-corpus affects the performance of its embeddings on word similarity.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="klubicka-etal-2019-synthetic">
<titleInfo>
<title>Synthetic, yet natural: Properties of WordNet random walk corpora and the impact of rare words on embedding performance</title>
</titleInfo>
<name type="personal">
<namePart type="given">Filip</namePart>
<namePart type="family">Klubička</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alfredo</namePart>
<namePart type="family">Maldonado</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhijit</namePart>
<namePart type="family">Mahalunkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Kelleher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Global Wordnet Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Piek</namePart>
<namePart type="family">Vossen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christiane</namePart>
<namePart type="family">Fellbaum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Global Wordnet Association</publisher>
<place>
<placeTerm type="text">Wroclaw, Poland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Creating word embeddings that reflect semantic relationships encoded in lexical knowledge resources is an open challenge. One approach is to use a random walk over a knowledge graph to generate a pseudo-corpus and use this corpus to train embeddings. However, the effect of the shape of the knowledge graph on the generated pseudo-corpora, and on the resulting word embeddings, has not been studied. To explore this, we use English WordNet, constrained to the taxonomic (tree-like) portion of the graph, as a case study. We investigate the properties of the generated pseudo-corpora, and their impact on the resulting embeddings. We find that the distributions in the pseudo-corpora exhibit properties found in natural corpora, such as Zipf’s and Heaps’ law, and also observe that the proportion of rare words in a pseudo-corpus affects the performance of its embeddings on word similarity.</abstract>
<identifier type="citekey">klubicka-etal-2019-synthetic</identifier>
<location>
<url>https://aclanthology.org/2019.gwc-1.18</url>
</location>
<part>
<date>2019-07</date>
<extent unit="page">
<start>140</start>
<end>150</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Synthetic, yet natural: Properties of WordNet random walk corpora and the impact of rare words on embedding performance
%A Klubička, Filip
%A Maldonado, Alfredo
%A Mahalunkar, Abhijit
%A Kelleher, John
%Y Vossen, Piek
%Y Fellbaum, Christiane
%S Proceedings of the 10th Global Wordnet Conference
%D 2019
%8 July
%I Global Wordnet Association
%C Wroclaw, Poland
%F klubicka-etal-2019-synthetic
%X Creating word embeddings that reflect semantic relationships encoded in lexical knowledge resources is an open challenge. One approach is to use a random walk over a knowledge graph to generate a pseudo-corpus and use this corpus to train embeddings. However, the effect of the shape of the knowledge graph on the generated pseudo-corpora, and on the resulting word embeddings, has not been studied. To explore this, we use English WordNet, constrained to the taxonomic (tree-like) portion of the graph, as a case study. We investigate the properties of the generated pseudo-corpora, and their impact on the resulting embeddings. We find that the distributions in the pseudo-corpora exhibit properties found in natural corpora, such as Zipf’s and Heaps’ law, and also observe that the proportion of rare words in a pseudo-corpus affects the performance of its embeddings on word similarity.
%U https://aclanthology.org/2019.gwc-1.18
%P 140-150
Markdown (Informal)
[Synthetic, yet natural: Properties of WordNet random walk corpora and the impact of rare words on embedding performance](https://aclanthology.org/2019.gwc-1.18) (Klubička et al., GWC 2019)
ACL