% HiT-IT 2019 workshop paper (ACL Anthology W19-8702).
@inproceedings{brkic-bakaric-lalli-pacelat-2019-parallel,
    title = "Parallel Corpus of {Croatian}-{Italian} Administrative Texts",
    author = "Brkic Bakaric, Marija and
      Lalli Pacelat, Ivana",
    booktitle = "Proceedings of the Human-Informed Translation and Interpreting Technology Workshop (HiT-IT 2019)",
    month = sep,
    year = "2019",
    address = "Varna, Bulgaria",
    publisher = "Incoma Ltd., Shoumen, Bulgaria",
    url = "https://aclanthology.org/W19-8702",
    doi = "10.26615/issn.2683-0078.2019_002",
    pages = "11--18",
    abstract = "Parallel corpora constitute a unique resource for providing assistance to human translators. The selection and preparation of the parallel corpora also conditions the quality of the resulting MT engine. Since Croatian is a national language and Italian is officially recognized as a minority language in seven cities and twelve municipalities of Istria County, a large amount of parallel texts is produced on a daily basis. However, there have been no attempts in using these texts for compiling a parallel corpus. A domain-specific sentence-aligned parallel Croatian-Italian corpus of administrative texts would be of high value in creating different language tools and resources. The aim of this paper is, therefore, to explore the value of parallel documents which are publicly available mostly in pdf format and to investigate the use of automatically-built dictionaries in corpus compilation. The effects that a document format and, consequently sentence splitting, and the dictionary input have on the sentence alignment process are manually evaluated.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="brkic-bakaric-lalli-pacelat-2019-parallel">
<titleInfo>
<title>Parallel Corpus of Croatian-Italian Administrative Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marija</namePart>
<namePart type="family">Brkic Bakaric</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivana</namePart>
<namePart type="family">Lalli Pacelat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Human-Informed Translation and Interpreting Technology Workshop (HiT-IT 2019)</title>
</titleInfo>
<originInfo>
<publisher>Incoma Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Parallel corpora constitute a unique resource for providing assistance to human translators. The selection and preparation of the parallel corpora also conditions the quality of the resulting MT engine. Since Croatian is a national language and Italian is officially recognized as a minority language in seven cities and twelve municipalities of Istria County, a large amount of parallel texts is produced on a daily basis. However, there have been no attempts in using these texts for compiling a parallel corpus. A domain-specific sentence-aligned parallel Croatian-Italian corpus of administrative texts would be of high value in creating different language tools and resources. The aim of this paper is, therefore, to explore the value of parallel documents which are publicly available mostly in pdf format and to investigate the use of automatically-built dictionaries in corpus compilation. The effects that a document format and, consequently sentence splitting, and the dictionary input have on the sentence alignment process are manually evaluated.</abstract>
<identifier type="citekey">brkic-bakaric-lalli-pacelat-2019-parallel</identifier>
<identifier type="doi">10.26615/issn.2683-0078.2019_002</identifier>
<location>
<url>https://aclanthology.org/W19-8702</url>
</location>
<part>
<date>2019-09</date>
<extent unit="page">
<start>11</start>
<end>18</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Parallel Corpus of Croatian-Italian Administrative Texts
%A Brkic Bakaric, Marija
%A Lalli Pacelat, Ivana
%S Proceedings of the Human-Informed Translation and Interpreting Technology Workshop (HiT-IT 2019)
%D 2019
%8 September
%I Incoma Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F brkic-bakaric-lalli-pacelat-2019-parallel
%X Parallel corpora constitute a unique resource for providing assistance to human translators. The selection and preparation of the parallel corpora also conditions the quality of the resulting MT engine. Since Croatian is a national language and Italian is officially recognized as a minority language in seven cities and twelve municipalities of Istria County, a large amount of parallel texts is produced on a daily basis. However, there have been no attempts in using these texts for compiling a parallel corpus. A domain-specific sentence-aligned parallel Croatian-Italian corpus of administrative texts would be of high value in creating different language tools and resources. The aim of this paper is, therefore, to explore the value of parallel documents which are publicly available mostly in pdf format and to investigate the use of automatically-built dictionaries in corpus compilation. The effects that a document format and, consequently sentence splitting, and the dictionary input have on the sentence alignment process are manually evaluated.
%R 10.26615/issn.2683-0078.2019_002
%U https://aclanthology.org/W19-8702
%U https://doi.org/10.26615/issn.2683-0078.2019_002
%P 11-18
Markdown (Informal)
[Parallel Corpus of Croatian-Italian Administrative Texts](https://aclanthology.org/W19-8702) (Brkic Bakaric & Lalli Pacelat, RANLP 2019)
ACL