@inproceedings{arnold-etal-2020-addressing,
title = "Addressing Cha(lle)nges in Long-Term Archiving of Large Corpora",
author = "Arnold, Denis and
Fisseni, Bernhard and
Kamocki, Pawel and
Schonefeld, Oliver and
Kupietz, Marc and
Schmidt, Thomas",
editor = {Ba{\'n}ski, Piotr and
Barbaresi, Adrien and
Clematide, Simon and
Kupietz, Marc and
L{\"u}ngen, Harald and
Pisetta, Ines},
booktitle = "Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.cmlc-1.1/",
pages = "1--9",
language = "eng",
ISBN = "979-10-95546-61-0",
abstract = "This paper addresses long-term archival for large corpora. Three aspects specific to language resources are focused, namely (1) the removal of resources for legal reasons, (2) versioning of (unchanged) objects in constantly growing resources, especially where objects can be part of multiple releases but also part of different collections, and (3) the conversion of data to new formats for digital preservation. It is motivated why language resources may have to be changed, and why formats may need to be converted. As a solution, the use of an intermediate proxy object called a signpost is suggested. The approach will be exemplified with respect to the corpora of the Leibniz Institute for the German Language in Mannheim, namely the German Reference Corpus (DeReKo) and the Archive for Spoken German (AGD)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arnold-etal-2020-addressing">
<titleInfo>
<title>Addressing Cha(lle)nges in Long-Term Archiving of Large Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Denis</namePart>
<namePart type="family">Arnold</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bernhard</namePart>
<namePart type="family">Fisseni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pawel</namePart>
<namePart type="family">Kamocki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oliver</namePart>
<namePart type="family">Schonefeld</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Kupietz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Schmidt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Piotr</namePart>
<namePart type="family">Bański</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Clematide</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Kupietz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harald</namePart>
<namePart type="family">Lüngen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ines</namePart>
<namePart type="family">Pisetta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-61-0</identifier>
</relatedItem>
<abstract>This paper addresses long-term archival for large corpora. Three aspects specific to language resources are focused, namely (1) the removal of resources for legal reasons, (2) versioning of (unchanged) objects in constantly growing resources, especially where objects can be part of multiple releases but also part of different collections, and (3) the conversion of data to new formats for digital preservation. It is motivated why language resources may have to be changed, and why formats may need to be converted. As a solution, the use of an intermediate proxy object called a signpost is suggested. The approach will be exemplified with respect to the corpora of the Leibniz Institute for the German Language in Mannheim, namely the German Reference Corpus (DeReKo) and the Archive for Spoken German (AGD).</abstract>
<identifier type="citekey">arnold-etal-2020-addressing</identifier>
<location>
<url>https://aclanthology.org/2020.cmlc-1.1/</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>1</start>
<end>9</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Addressing Cha(lle)nges in Long-Term Archiving of Large Corpora
%A Arnold, Denis
%A Fisseni, Bernhard
%A Kamocki, Pawel
%A Schonefeld, Oliver
%A Kupietz, Marc
%A Schmidt, Thomas
%Y Bański, Piotr
%Y Barbaresi, Adrien
%Y Clematide, Simon
%Y Kupietz, Marc
%Y Lüngen, Harald
%Y Pisetta, Ines
%S Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-61-0
%G eng
%F arnold-etal-2020-addressing
%X This paper addresses long-term archival for large corpora. Three aspects specific to language resources are focused, namely (1) the removal of resources for legal reasons, (2) versioning of (unchanged) objects in constantly growing resources, especially where objects can be part of multiple releases but also part of different collections, and (3) the conversion of data to new formats for digital preservation. It is motivated why language resources may have to be changed, and why formats may need to be converted. As a solution, the use of an intermediate proxy object called a signpost is suggested. The approach will be exemplified with respect to the corpora of the Leibniz Institute for the German Language in Mannheim, namely the German Reference Corpus (DeReKo) and the Archive for Spoken German (AGD).
%U https://aclanthology.org/2020.cmlc-1.1/
%P 1-9
Markdown (Informal)
[Addressing Cha(lle)nges in Long-Term Archiving of Large Corpora](https://aclanthology.org/2020.cmlc-1.1/) (Arnold et al., CMLC 2020)
ACL
- Denis Arnold, Bernhard Fisseni, Pawel Kamocki, Oliver Schonefeld, Marc Kupietz, and Thomas Schmidt. 2020. Addressing Cha(lle)nges in Long-Term Archiving of Large Corpora. In Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora, pages 1–9, Marseille, France. European Language Resources Association.