@inproceedings{simig-etal-2022-text,
title = "Text Characterization Toolkit ({TCT})",
author = "Simig, Daniel and
Wang, Tianlu and
Dankers, Verna and
Henderson, Peter and
Batsuren, Khuyagbaatar and
Hupkes, Dieuwke and
Diab, Mona",
editor = "Buntine, Wray and
Liakata, Maria",
booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing: System Demonstrations",
month = nov,
year = "2022",
address = "Taipei, Taiwan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.aacl-demo.9/",
doi = "10.18653/v1/2022.aacl-demo.9",
pages = "72--87",
abstract = "We present a tool, Text Characterization Toolkit (TCT), that researchers can use to study characteristics of large datasets. Furthermore, such properties can lead to understanding the influence of such attributes on models' behaviour. Traditionally, in most NLP research, models are usually evaluated by reporting single-number performance scores on a number of readily available benchmarks, without much deeper analysis. Here, we argue that {--} especially given the well-known fact that benchmarks often contain biases, artefacts, and spurious correlations {--} deeper results analysis should become the de-facto standard when presenting new models or benchmarks. TCT aims at filling this gap by facilitating such deeper analysis for datasets at scale, where datasets can be for training/development/evaluation. TCT includes both an easy-to-use tool, as well as off-the-shelf scripts that can be used for specific analyses. We also present use-cases from several different domains. TCT is used to predict difficult examples for given well-known trained models; TCT is also used to identify (potentially harmful) biases present in a dataset."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="simig-etal-2022-text">
<titleInfo>
<title>Text Characterization Toolkit (TCT)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Simig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianlu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verna</namePart>
<namePart type="family">Dankers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Henderson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khuyagbaatar</namePart>
<namePart type="family">Batsuren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dieuwke</namePart>
<namePart type="family">Hupkes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mona</namePart>
<namePart type="family">Diab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wray</namePart>
<namePart type="family">Buntine</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Taipei, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a tool, Text Characterization Toolkit (TCT), that researchers can use to study characteristics of large datasets. Furthermore, such properties can lead to understanding the influence of such attributes on models’ behaviour. Traditionally, in most NLP research, models are usually evaluated by reporting single-number performance scores on a number of readily available benchmarks, without much deeper analysis. Here, we argue that – especially given the well-known fact that benchmarks often contain biases, artefacts, and spurious correlations – deeper results analysis should become the de-facto standard when presenting new models or benchmarks. TCT aims at filling this gap by facilitating such deeper analysis for datasets at scale, where datasets can be for training/development/evaluation. TCT includes both an easy-to-use tool, as well as off-the-shelf scripts that can be used for specific analyses. We also present use-cases from several different domains. TCT is used to predict difficult examples for given well-known trained models; TCT is also used to identify (potentially harmful) biases present in a dataset.</abstract>
<identifier type="citekey">simig-etal-2022-text</identifier>
<identifier type="doi">10.18653/v1/2022.aacl-demo.9</identifier>
<location>
<url>https://aclanthology.org/2022.aacl-demo.9/</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>72</start>
<end>87</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Text Characterization Toolkit (TCT)
%A Simig, Daniel
%A Wang, Tianlu
%A Dankers, Verna
%A Henderson, Peter
%A Batsuren, Khuyagbaatar
%A Hupkes, Dieuwke
%A Diab, Mona
%Y Buntine, Wray
%Y Liakata, Maria
%S Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing: System Demonstrations
%D 2022
%8 November
%I Association for Computational Linguistics
%C Taipei, Taiwan
%F simig-etal-2022-text
%X We present a tool, Text Characterization Toolkit (TCT), that researchers can use to study characteristics of large datasets. Furthermore, such properties can lead to understanding the influence of such attributes on models’ behaviour. Traditionally, in most NLP research, models are usually evaluated by reporting single-number performance scores on a number of readily available benchmarks, without much deeper analysis. Here, we argue that – especially given the well-known fact that benchmarks often contain biases, artefacts, and spurious correlations – deeper results analysis should become the de-facto standard when presenting new models or benchmarks. TCT aims at filling this gap by facilitating such deeper analysis for datasets at scale, where datasets can be for training/development/evaluation. TCT includes both an easy-to-use tool, as well as off-the-shelf scripts that can be used for specific analyses. We also present use-cases from several different domains. TCT is used to predict difficult examples for given well-known trained models; TCT is also used to identify (potentially harmful) biases present in a dataset.
%R 10.18653/v1/2022.aacl-demo.9
%U https://aclanthology.org/2022.aacl-demo.9/
%U https://doi.org/10.18653/v1/2022.aacl-demo.9
%P 72-87
Markdown (Informal)
[Text Characterization Toolkit (TCT)](https://aclanthology.org/2022.aacl-demo.9/) (Simig et al., AACL-IJCNLP 2022)
ACL
- Daniel Simig, Tianlu Wang, Verna Dankers, Peter Henderson, Khuyagbaatar Batsuren, Dieuwke Hupkes, and Mona Diab. 2022. Text Characterization Toolkit (TCT). In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing: System Demonstrations, pages 72–87, Taipei, Taiwan. Association for Computational Linguistics.