@inproceedings{korner-etal-2022-crawling,
title = "Crawling Under-Resourced Languages - a Portal for Community-Contributed Corpus Collection",
author = {K{\"o}rner, Erik and
Helfer, Felix and
Schr{\"o}der, Christopher and
Eckart, Thomas and
Goldhahn, Dirk},
editor = {S{\"a}lev{\"a}, Jonne and
Lignos, Constantine},
booktitle = "Proceedings of the Workshop on Dataset Creation for Lower-Resourced Languages within the 13th Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.dclrl-1.5",
pages = "36--43",
abstract = "The {``}Web as corpus{''} paradigm opens opportunities for enhancing the current state of language resources for endangered and under-resourced languages. However, standard crawling strategies tend to overlook available resources of these languages in favor of already well-documented ones. Since 2016, the {``}Crawling Under-Resourced Languages{''} portal (CURL) has been contributing to bridging the gap between established crawling techniques and knowledge about relevant Web resources that is only available in the specific language communities. The aim of the CURL portal is to enlarge the amount of available text material for under-resourced languages thereby developing available datasets further and to use them as a basis for statistical evaluation and enrichment of already available resources. The application is currently provided and further developed as part of the thematic cluster {``}Non-Latin scripts and Under-resourced languages{''} in the German national research consortium Text+. In this context, its focus lies on the extraction of text material and statistical information for the data domain {``}Lexical resources{''}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="korner-etal-2022-crawling">
<titleInfo>
<title>Crawling Under-Resourced Languages - a Portal for Community-Contributed Corpus Collection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Körner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Helfer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Schröder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Eckart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dirk</namePart>
<namePart type="family">Goldhahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Dataset Creation for Lower-Resourced Languages within the 13th Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jonne</namePart>
<namePart type="family">Sälevä</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Constantine</namePart>
<namePart type="family">Lignos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The “Web as corpus” paradigm opens opportunities for enhancing the current state of language resources for endangered and under-resourced languages. However, standard crawling strategies tend to overlook available resources of these languages in favor of already well-documented ones. Since 2016, the “Crawling Under-Resourced Languages” portal (CURL) has been contributing to bridging the gap between established crawling techniques and knowledge about relevant Web resources that is only available in the specific language communities. The aim of the CURL portal is to enlarge the amount of available text material for under-resourced languages thereby developing available datasets further and to use them as a basis for statistical evaluation and enrichment of already available resources. The application is currently provided and further developed as part of the thematic cluster “Non-Latin scripts and Under-resourced languages” in the German national research consortium Text+. In this context, its focus lies on the extraction of text material and statistical information for the data domain “Lexical resources”.</abstract>
<identifier type="citekey">korner-etal-2022-crawling</identifier>
<location>
<url>https://aclanthology.org/2022.dclrl-1.5</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>36</start>
<end>43</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Crawling Under-Resourced Languages - a Portal for Community-Contributed Corpus Collection
%A Körner, Erik
%A Helfer, Felix
%A Schröder, Christopher
%A Eckart, Thomas
%A Goldhahn, Dirk
%Y Sälevä, Jonne
%Y Lignos, Constantine
%S Proceedings of the Workshop on Dataset Creation for Lower-Resourced Languages within the 13th Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F korner-etal-2022-crawling
%X The “Web as corpus” paradigm opens opportunities for enhancing the current state of language resources for endangered and under-resourced languages. However, standard crawling strategies tend to overlook available resources of these languages in favor of already well-documented ones. Since 2016, the “Crawling Under-Resourced Languages” portal (CURL) has been contributing to bridging the gap between established crawling techniques and knowledge about relevant Web resources that is only available in the specific language communities. The aim of the CURL portal is to enlarge the amount of available text material for under-resourced languages thereby developing available datasets further and to use them as a basis for statistical evaluation and enrichment of already available resources. The application is currently provided and further developed as part of the thematic cluster “Non-Latin scripts and Under-resourced languages” in the German national research consortium Text+. In this context, its focus lies on the extraction of text material and statistical information for the data domain “Lexical resources”.
%U https://aclanthology.org/2022.dclrl-1.5
%P 36-43
Markdown (Informal)
[Crawling Under-Resourced Languages - a Portal for Community-Contributed Corpus Collection](https://aclanthology.org/2022.dclrl-1.5) (Körner et al., DCLRL 2022)
ACL