@inproceedings{frick-schmidt-2020-using,
title = "Using full text indices for querying spoken language data",
author = "Frick, Elena and
Schmidt, Thomas",
editor = {Ba{\'n}ski, Piotr and
Barbaresi, Adrien and
Clematide, Simon and
Kupietz, Marc and
L{\"u}ngen, Harald and
Pisetta, Ines},
booktitle = "Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Ressources Association",
url = "https://aclanthology.org/2020.cmlc-1.6",
pages = "40--46",
abstract = "As a part of the ZuMult-project, we are currently modelling a backend architecture that should provide query access to corpora from the Archive of Spoken German (AGD) at the Leibniz-Institute for the German Language (IDS). We are exploring how to reuse existing search engine frameworks providing full text indices and allowing to query corpora by one of the corpus query languages (QLs) established and actively used in the corpus research community. For this purpose, we tested MTAS - an open source Lucene-based search engine for querying on text with multilevel annotations. We applied MTAS on three oral corpora stored in the TEI-based ISO standard for transcriptions of spoken language (ISO 24624:2016). These corpora differ from the corpus data that MTAS was developed for, because they include interactions with two and more speakers and are enriched, inter alia, with timeline-based annotations. In this contribution, we report our test results and address issues that arise when search frameworks originally developed for querying written corpora are being transferred into the field of spoken language.",
language = "English",
ISBN = "979-10-95546-61-0",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="frick-schmidt-2020-using">
<titleInfo>
<title>Using full text indices for querying spoken language data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Frick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Schmidt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Piotr</namePart>
<namePart type="family">Bański</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Clematide</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Kupietz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harald</namePart>
<namePart type="family">Lüngen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ines</namePart>
<namePart type="family">Pisetta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Ressources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-61-0</identifier>
</relatedItem>
<abstract>As a part of the ZuMult-project, we are currently modelling a backend architecture that should provide query access to corpora from the Archive of Spoken German (AGD) at the Leibniz-Institute for the German Language (IDS). We are exploring how to reuse existing search engine frameworks providing full text indices and allowing to query corpora by one of the corpus query languages (QLs) established and actively used in the corpus research community. For this purpose, we tested MTAS - an open source Lucene-based search engine for querying on text with multilevel annotations. We applied MTAS on three oral corpora stored in the TEI-based ISO standard for transcriptions of spoken language (ISO 24624:2016). These corpora differ from the corpus data that MTAS was developed for, because they include interactions with two and more speakers and are enriched, inter alia, with timeline-based annotations. In this contribution, we report our test results and address issues that arise when search frameworks originally developed for querying written corpora are being transferred into the field of spoken language.</abstract>
<identifier type="citekey">frick-schmidt-2020-using</identifier>
<location>
<url>https://aclanthology.org/2020.cmlc-1.6</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>40</start>
<end>46</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Using full text indices for querying spoken language data
%A Frick, Elena
%A Schmidt, Thomas
%Y Bański, Piotr
%Y Barbaresi, Adrien
%Y Clematide, Simon
%Y Kupietz, Marc
%Y Lüngen, Harald
%Y Pisetta, Ines
%S Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora
%D 2020
%8 May
%I European Language Ressources Association
%C Marseille, France
%@ 979-10-95546-61-0
%G English
%F frick-schmidt-2020-using
%X As a part of the ZuMult-project, we are currently modelling a backend architecture that should provide query access to corpora from the Archive of Spoken German (AGD) at the Leibniz-Institute for the German Language (IDS). We are exploring how to reuse existing search engine frameworks providing full text indices and allowing to query corpora by one of the corpus query languages (QLs) established and actively used in the corpus research community. For this purpose, we tested MTAS - an open source Lucene-based search engine for querying on text with multilevel annotations. We applied MTAS on three oral corpora stored in the TEI-based ISO standard for transcriptions of spoken language (ISO 24624:2016). These corpora differ from the corpus data that MTAS was developed for, because they include interactions with two and more speakers and are enriched, inter alia, with timeline-based annotations. In this contribution, we report our test results and address issues that arise when search frameworks originally developed for querying written corpora are being transferred into the field of spoken language.
%U https://aclanthology.org/2020.cmlc-1.6
%P 40-46
Markdown (Informal)
[Using full text indices for querying spoken language data](https://aclanthology.org/2020.cmlc-1.6) (Frick & Schmidt, CMLC 2020)
ACL