@inproceedings{bencke-etal-2024-inferbr,
title = "{I}nfer{BR}: A Natural Language Inference Dataset in {P}ortuguese",
author = "Bencke, Luciana and
Pereira, Francielle Vasconcellos and
Santos, Moniele Kunrath and
Moreira, Viviane",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.793",
pages = "9050--9060",
abstract = "Natural Language Inference semantic concepts are central to all aspects of natural language meaning. Portuguese has few NLI-annotated datasets created through automatic translation followed by manual checking. The manual creation of NLI datasets is complex and requires many efforts that are sometimes unavailable. Thus, investments to produce good quality synthetic instances that could be used to train machine learning models for NLI are welcome. This work produced InferBR, an NLI dataset for Portuguese. We relied on a semiautomatic process to generate premises and an automatic process to generate hypotheses. The dataset was manually revised, showing that 97.4{\%} of the sentence pairs had good quality, and nearly 100{\%} of the instances had the correct label assigned. The model trained with InferBR is better at recognizing entailment classes in the other Portuguese datasets than the reverse. Because of its diversity and many unique sentences, InferBR can potentially be further augmented. In addition to the dataset, a key contribution is our proposed generation processes for premises and hypotheses that can easily be adapted to other languages and tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bencke-etal-2024-inferbr">
<titleInfo>
<title>InferBR: A Natural Language Inference Dataset in Portuguese</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luciana</namePart>
<namePart type="family">Bencke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francielle</namePart>
<namePart type="given">Vasconcellos</namePart>
<namePart type="family">Pereira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moniele</namePart>
<namePart type="given">Kunrath</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Natural Language Inference semantic concepts are central to all aspects of natural language meaning. Portuguese has few NLI-annotated datasets created through automatic translation followed by manual checking. The manual creation of NLI datasets is complex and requires many efforts that are sometimes unavailable. Thus, investments to produce good quality synthetic instances that could be used to train machine learning models for NLI are welcome. This work produced InferBR, an NLI dataset for Portuguese. We relied on a semiautomatic process to generate premises and an automatic process to generate hypotheses. The dataset was manually revised, showing that 97.4% of the sentence pairs had good quality, and nearly 100% of the instances had the correct label assigned. The model trained with InferBR is better at recognizing entailment classes in the other Portuguese datasets than the reverse. Because of its diversity and many unique sentences, InferBR can potentially be further augmented. In addition to the dataset, a key contribution is our proposed generation processes for premises and hypotheses that can easily be adapted to other languages and tasks.</abstract>
<identifier type="citekey">bencke-etal-2024-inferbr</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.793</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>9050</start>
<end>9060</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T InferBR: A Natural Language Inference Dataset in Portuguese
%A Bencke, Luciana
%A Pereira, Francielle Vasconcellos
%A Santos, Moniele Kunrath
%A Moreira, Viviane
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F bencke-etal-2024-inferbr
%X Natural Language Inference semantic concepts are central to all aspects of natural language meaning. Portuguese has few NLI-annotated datasets created through automatic translation followed by manual checking. The manual creation of NLI datasets is complex and requires many efforts that are sometimes unavailable. Thus, investments to produce good quality synthetic instances that could be used to train machine learning models for NLI are welcome. This work produced InferBR, an NLI dataset for Portuguese. We relied on a semiautomatic process to generate premises and an automatic process to generate hypotheses. The dataset was manually revised, showing that 97.4% of the sentence pairs had good quality, and nearly 100% of the instances had the correct label assigned. The model trained with InferBR is better at recognizing entailment classes in the other Portuguese datasets than the reverse. Because of its diversity and many unique sentences, InferBR can potentially be further augmented. In addition to the dataset, a key contribution is our proposed generation processes for premises and hypotheses that can easily be adapted to other languages and tasks.
%U https://aclanthology.org/2024.lrec-main.793
%P 9050-9060
Markdown (Informal)
[InferBR: A Natural Language Inference Dataset in Portuguese](https://aclanthology.org/2024.lrec-main.793) (Bencke et al., LREC-COLING 2024)
ACL
- Luciana Bencke, Francielle Vasconcellos Pereira, Moniele Kunrath Santos, and Viviane Moreira. 2024. InferBR: A Natural Language Inference Dataset in Portuguese. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 9050–9060, Torino, Italia. ELRA and ICCL.