@inproceedings{banon-etal-2020-paracrawl,
title = "{P}ara{C}rawl: Web-Scale Acquisition of Parallel Corpora",
author = "Ba{\~n}{\'o}n, Marta and
Chen, Pinzhen and
Haddow, Barry and
Heafield, Kenneth and
Hoang, Hieu and
Espl{\`a}-Gomis, Miquel and
Forcada, Mikel L. and
Kamran, Amir and
Kirefu, Faheem and
Koehn, Philipp and
Ortiz Rojas, Sergio and
Pla Sempere, Leopoldo and
Ram{\'\i}rez-S{\'a}nchez, Gema and
Sarr{\'\i}as, Elsa and
Strelec, Marek and
Thompson, Brian and
Waites, William and
Wiggins, Dion and
Zaragoza, Jaume",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.417",
doi = "10.18653/v1/2020.acl-main.417",
pages = "4555--4567",
abstract = "We report on methods to create the largest publicly available parallel corpora by crawling the web, using open source software. We empirically compare alternative methods and publish benchmark data sets for sentence alignment and sentence pair filtering. We also describe the parallel corpora released and evaluate their quality and their usefulness to create machine translation systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="banon-etal-2020-paracrawl">
<titleInfo>
<title>ParaCrawl: Web-Scale Acquisition of Parallel Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Bañón</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Heafield</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hieu</namePart>
<namePart type="family">Hoang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Kamran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Faheem</namePart>
<namePart type="family">Kirefu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Ortiz Rojas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leopoldo</namePart>
<namePart type="family">Pla Sempere</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gema</namePart>
<namePart type="family">Ramírez-Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elsa</namePart>
<namePart type="family">Sarrías</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marek</namePart>
<namePart type="family">Strelec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Waites</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dion</namePart>
<namePart type="family">Wiggins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaume</namePart>
<namePart type="family">Zaragoza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dan</namePart>
<namePart type="family">Jurafsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Chai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalie</namePart>
<namePart type="family">Schluter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We report on methods to create the largest publicly available parallel corpora by crawling the web, using open source software. We empirically compare alternative methods and publish benchmark data sets for sentence alignment and sentence pair filtering. We also describe the parallel corpora released and evaluate their quality and their usefulness to create machine translation systems.</abstract>
<identifier type="citekey">banon-etal-2020-paracrawl</identifier>
<identifier type="doi">10.18653/v1/2020.acl-main.417</identifier>
<location>
<url>https://aclanthology.org/2020.acl-main.417</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>4555</start>
<end>4567</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ParaCrawl: Web-Scale Acquisition of Parallel Corpora
%A Bañón, Marta
%A Chen, Pinzhen
%A Haddow, Barry
%A Heafield, Kenneth
%A Hoang, Hieu
%A Esplà-Gomis, Miquel
%A Forcada, Mikel L.
%A Kamran, Amir
%A Kirefu, Faheem
%A Koehn, Philipp
%A Ortiz Rojas, Sergio
%A Pla Sempere, Leopoldo
%A Ramírez-Sánchez, Gema
%A Sarrías, Elsa
%A Strelec, Marek
%A Thompson, Brian
%A Waites, William
%A Wiggins, Dion
%A Zaragoza, Jaume
%Y Jurafsky, Dan
%Y Chai, Joyce
%Y Schluter, Natalie
%Y Tetreault, Joel
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F banon-etal-2020-paracrawl
%X We report on methods to create the largest publicly available parallel corpora by crawling the web, using open source software. We empirically compare alternative methods and publish benchmark data sets for sentence alignment and sentence pair filtering. We also describe the parallel corpora released and evaluate their quality and their usefulness to create machine translation systems.
%R 10.18653/v1/2020.acl-main.417
%U https://aclanthology.org/2020.acl-main.417
%U https://doi.org/10.18653/v1/2020.acl-main.417
%P 4555-4567
Markdown (Informal)
[ParaCrawl: Web-Scale Acquisition of Parallel Corpora](https://aclanthology.org/2020.acl-main.417) (Bañón et al., ACL 2020)
ACL
- Marta Bañón, Pinzhen Chen, Barry Haddow, Kenneth Heafield, Hieu Hoang, Miquel Esplà-Gomis, Mikel L. Forcada, Amir Kamran, Faheem Kirefu, Philipp Koehn, Sergio Ortiz Rojas, Leopoldo Pla Sempere, Gema Ramírez-Sánchez, Elsa Sarrías, Marek Strelec, Brian Thompson, William Waites, Dion Wiggins, and Jaume Zaragoza. 2020. ParaCrawl: Web-Scale Acquisition of Parallel Corpora. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 4555–4567, Online. Association for Computational Linguistics.