@inproceedings{boulianne-2022-phoneme,
title = "Phoneme transcription of endangered languages: an evaluation of recent {ASR} architectures in the single speaker scenario",
author = "Boulianne, Gilles",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-acl.180",
doi = "10.18653/v1/2022.findings-acl.180",
pages = "2301--2308",
abstract = "Transcription is often reported as the bottleneck in endangered language documentation, requiring large efforts from scarce speakers and transcribers. In general, automatic speech recognition (ASR) can be accurate enough to accelerate transcription only if trained on large amounts of transcribed data. However, when a single speaker is involved, several studies have reported encouraging results for phonetic transcription even with small amounts of training. Here we expand this body of work on speaker-dependent transcription by comparing four ASR approaches, notably recent transformer and pretrained multilingual models, on a common dataset of 11 languages. To automate data preparation, training and evaluation steps, we also developed a phoneme recognition setup which handles morphologically complex languages and writing systems for which no pronunciation dictionary exists. We find that fine-tuning a multilingual pretrained model yields an average phoneme error rate (PER) of 15{\%} for 6 languages with 99 minutes or less of transcribed data for training. For the 5 languages with between 100 and 192 minutes of training, we achieved a PER of 8.4{\%} or less. These results on a number of varied languages suggest that ASR can now significantly reduce transcription efforts in the speaker-dependent situation common in endangered language work.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="boulianne-2022-phoneme">
<titleInfo>
<title>Phoneme transcription of endangered languages: an evaluation of recent ASR architectures in the single speaker scenario</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gilles</namePart>
<namePart type="family">Boulianne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Transcription is often reported as the bottleneck in endangered language documentation, requiring large efforts from scarce speakers and transcribers. In general, automatic speech recognition (ASR) can be accurate enough to accelerate transcription only if trained on large amounts of transcribed data. However, when a single speaker is involved, several studies have reported encouraging results for phonetic transcription even with small amounts of training. Here we expand this body of work on speaker-dependent transcription by comparing four ASR approaches, notably recent transformer and pretrained multilingual models, on a common dataset of 11 languages. To automate data preparation, training and evaluation steps, we also developed a phoneme recognition setup which handles morphologically complex languages and writing systems for which no pronunciation dictionary exists. We find that fine-tuning a multilingual pretrained model yields an average phoneme error rate (PER) of 15% for 6 languages with 99 minutes or less of transcribed data for training. For the 5 languages with between 100 and 192 minutes of training, we achieved a PER of 8.4% or less. These results on a number of varied languages suggest that ASR can now significantly reduce transcription efforts in the speaker-dependent situation common in endangered language work.</abstract>
<identifier type="citekey">boulianne-2022-phoneme</identifier>
<identifier type="doi">10.18653/v1/2022.findings-acl.180</identifier>
<location>
<url>https://aclanthology.org/2022.findings-acl.180</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>2301</start>
<end>2308</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Phoneme transcription of endangered languages: an evaluation of recent ASR architectures in the single speaker scenario
%A Boulianne, Gilles
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Findings of the Association for Computational Linguistics: ACL 2022
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F boulianne-2022-phoneme
%X Transcription is often reported as the bottleneck in endangered language documentation, requiring large efforts from scarce speakers and transcribers. In general, automatic speech recognition (ASR) can be accurate enough to accelerate transcription only if trained on large amounts of transcribed data. However, when a single speaker is involved, several studies have reported encouraging results for phonetic transcription even with small amounts of training. Here we expand this body of work on speaker-dependent transcription by comparing four ASR approaches, notably recent transformer and pretrained multilingual models, on a common dataset of 11 languages. To automate data preparation, training and evaluation steps, we also developed a phoneme recognition setup which handles morphologically complex languages and writing systems for which no pronunciation dictionary exists. We find that fine-tuning a multilingual pretrained model yields an average phoneme error rate (PER) of 15% for 6 languages with 99 minutes or less of transcribed data for training. For the 5 languages with between 100 and 192 minutes of training, we achieved a PER of 8.4% or less. These results on a number of varied languages suggest that ASR can now significantly reduce transcription efforts in the speaker-dependent situation common in endangered language work.
%R 10.18653/v1/2022.findings-acl.180
%U https://aclanthology.org/2022.findings-acl.180
%U https://doi.org/10.18653/v1/2022.findings-acl.180
%P 2301-2308
Markdown (Informal)
[Phoneme transcription of endangered languages: an evaluation of recent ASR architectures in the single speaker scenario](https://aclanthology.org/2022.findings-acl.180) (Boulianne, Findings 2022)
ACL