@article{garimella-etal-2023-reflection,
title = "Reflection of Demographic Background on Word Usage",
author = "Garimella, Aparna and
Banea, Carmen and
Mihalcea, Rada",
journal = "Computational Linguistics",
volume = "49",
number = "2",
month = jun,
year = "2023",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2023.cl-2.4",
doi = "10.1162/coli_a_00475",
pages = "373--394",
abstract = "The availability of personal writings in electronic format provides researchers in the fields of linguistics, psychology, and computational linguistics with an unprecedented chance to study, on a large scale, the relationship between language use and the demographic background of writers, allowing us to better understand people across different demographics. In this article, we analyze the relation between language and demographics by developing cross-demographic word models to identify words with usage bias, or words that are used in significantly different ways by speakers of different demographics. Focusing on three demographic categories, namely, location, gender, and industry, we identify words with significant usage differences in each category and investigate various approaches of encoding a word{'}s usage, allowing us to identify language aspects that contribute to the differences. Our word models using topic-based features achieve at least 20{\%} improvement in accuracy over the baseline for all demographic categories, even for scenarios with classification into 15 categories, illustrating the usefulness of topic-based features in identifying word usage differences. Further, we note that for location and industry, topics extracted from immediate context are the best predictors of word usages, hinting at the importance of word meaning and its grammatical function for these demographics, while for gender, topics obtained from longer contexts are better predictors for word usage.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="garimella-etal-2023-reflection">
<titleInfo>
<title>Reflection of Demographic Background on Word Usage</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aparna</namePart>
<namePart type="family">Garimella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carmen</namePart>
<namePart type="family">Banea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rada</namePart>
<namePart type="family">Mihalcea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>The availability of personal writings in electronic format provides researchers in the fields of linguistics, psychology, and computational linguistics with an unprecedented chance to study, on a large scale, the relationship between language use and the demographic background of writers, allowing us to better understand people across different demographics. In this article, we analyze the relation between language and demographics by developing cross-demographic word models to identify words with usage bias, or words that are used in significantly different ways by speakers of different demographics. Focusing on three demographic categories, namely, location, gender, and industry, we identify words with significant usage differences in each category and investigate various approaches of encoding a word’s usage, allowing us to identify language aspects that contribute to the differences. Our word models using topic-based features achieve at least 20% improvement in accuracy over the baseline for all demographic categories, even for scenarios with classification into 15 categories, illustrating the usefulness of topic-based features in identifying word usage differences. Further, we note that for location and industry, topics extracted from immediate context are the best predictors of word usages, hinting at the importance of word meaning and its grammatical function for these demographics, while for gender, topics obtained from longer contexts are better predictors for word usage.</abstract>
<identifier type="citekey">garimella-etal-2023-reflection</identifier>
<identifier type="doi">10.1162/coli_a_00475</identifier>
<location>
<url>https://aclanthology.org/2023.cl-2.4</url>
</location>
<part>
<date>2023-06</date>
<detail type="volume"><number>49</number></detail>
<detail type="issue"><number>2</number></detail>
<extent unit="page">
<start>373</start>
<end>394</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Reflection of Demographic Background on Word Usage
%A Garimella, Aparna
%A Banea, Carmen
%A Mihalcea, Rada
%J Computational Linguistics
%D 2023
%8 June
%V 49
%N 2
%I MIT Press
%C Cambridge, MA
%F garimella-etal-2023-reflection
%X The availability of personal writings in electronic format provides researchers in the fields of linguistics, psychology, and computational linguistics with an unprecedented chance to study, on a large scale, the relationship between language use and the demographic background of writers, allowing us to better understand people across different demographics. In this article, we analyze the relation between language and demographics by developing cross-demographic word models to identify words with usage bias, or words that are used in significantly different ways by speakers of different demographics. Focusing on three demographic categories, namely, location, gender, and industry, we identify words with significant usage differences in each category and investigate various approaches of encoding a word’s usage, allowing us to identify language aspects that contribute to the differences. Our word models using topic-based features achieve at least 20% improvement in accuracy over the baseline for all demographic categories, even for scenarios with classification into 15 categories, illustrating the usefulness of topic-based features in identifying word usage differences. Further, we note that for location and industry, topics extracted from immediate context are the best predictors of word usages, hinting at the importance of word meaning and its grammatical function for these demographics, while for gender, topics obtained from longer contexts are better predictors for word usage.
%R 10.1162/coli_a_00475
%U https://aclanthology.org/2023.cl-2.4
%U https://doi.org/10.1162/coli_a_00475
%P 373-394
Markdown (Informal)
[Reflection of Demographic Background on Word Usage](https://aclanthology.org/2023.cl-2.4) (Garimella et al., CL 2023)
ACL