@inproceedings{born-etal-2023-learning,
title = "Learning the Character Inventories of Undeciphered Scripts Using Unsupervised Deep Clustering",
author = "Born, Logan and
Monroe, M. Willis and
Kelley, Kathryn and
Sarkar, Anoop",
editor = "Gorman, Kyle and
Sproat, Richard and
Roark, Brian",
booktitle = "Proceedings of the Workshop on Computation and Written Language (CAWL 2023)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.cawl-1.11/",
doi = "10.18653/v1/2023.cawl-1.11",
pages = "92--104",
abstract = "A crucial step in deciphering a text is to identify what set of characters were used to write it. This requires grouping character tokens according to visual and contextual features, which can be challenging for human analysts when the number of tokens or underlying types is large. Prior work has shown that this process can be automated by clustering dense representations of character images, in a task which we call {\textquotedblleft}script clustering{\textquotedblright}. In this work, we present novel architectures which exploit varying degrees of contextual and visual information to learn representations for use in script clustering. We evaluate on a range of modern and ancient scripts, and find that our models produce representations which are more effective for script recovery than the current state-of-the-art, despite using just {\textasciitilde}2{\%} as many parameters. Our analysis fruitfully applies these models to assess hypotheses about the character inventory of the partially-deciphered proto-Elamite script."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="born-etal-2023-learning">
<titleInfo>
<title>Learning the Character Inventories of Undeciphered Scripts Using Unsupervised Deep Clustering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Logan</namePart>
<namePart type="family">Born</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">M</namePart>
<namePart type="given">Willis</namePart>
<namePart type="family">Monroe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kathryn</namePart>
<namePart type="family">Kelley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Sarkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Computation and Written Language (CAWL 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Gorman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Sproat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Roark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A crucial step in deciphering a text is to identify what set of characters were used to write it. This requires grouping character tokens according to visual and contextual features, which can be challenging for human analysts when the number of tokens or underlying types is large. Prior work has shown that this process can be automated by clustering dense representations of character images, in a task which we call “script clustering”. In this work, we present novel architectures which exploit varying degrees of contextual and visual information to learn representations for use in script clustering. We evaluate on a range of modern and ancient scripts, and find that our models produce representations which are more effective for script recovery than the current state-of-the-art, despite using just ~2% as many parameters. Our analysis fruitfully applies these models to assess hypotheses about the character inventory of the partially-deciphered proto-Elamite script.</abstract>
<identifier type="citekey">born-etal-2023-learning</identifier>
<identifier type="doi">10.18653/v1/2023.cawl-1.11</identifier>
<location>
<url>https://aclanthology.org/2023.cawl-1.11/</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>92</start>
<end>104</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning the Character Inventories of Undeciphered Scripts Using Unsupervised Deep Clustering
%A Born, Logan
%A Monroe, M. Willis
%A Kelley, Kathryn
%A Sarkar, Anoop
%Y Gorman, Kyle
%Y Sproat, Richard
%Y Roark, Brian
%S Proceedings of the Workshop on Computation and Written Language (CAWL 2023)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F born-etal-2023-learning
%X A crucial step in deciphering a text is to identify what set of characters were used to write it. This requires grouping character tokens according to visual and contextual features, which can be challenging for human analysts when the number of tokens or underlying types is large. Prior work has shown that this process can be automated by clustering dense representations of character images, in a task which we call “script clustering”. In this work, we present novel architectures which exploit varying degrees of contextual and visual information to learn representations for use in script clustering. We evaluate on a range of modern and ancient scripts, and find that our models produce representations which are more effective for script recovery than the current state-of-the-art, despite using just ~2% as many parameters. Our analysis fruitfully applies these models to assess hypotheses about the character inventory of the partially-deciphered proto-Elamite script.
%R 10.18653/v1/2023.cawl-1.11
%U https://aclanthology.org/2023.cawl-1.11/
%U https://doi.org/10.18653/v1/2023.cawl-1.11
%P 92-104
Markdown (Informal)
[Learning the Character Inventories of Undeciphered Scripts Using Unsupervised Deep Clustering](https://aclanthology.org/2023.cawl-1.11/) (Born et al., CAWL 2023)
ACL