@inproceedings{prevot-wang-2024-experimenting,
title = "Experimenting with Discourse Segmentation of {T}aiwan {S}outhern {M}in Spontaneous Speech",
author = "Pr{\'e}vot, Laurent and
Wang, Sheng-Fu",
editor = "Strube, Michael and
Braud, Chloe and
Hardmeier, Christian and
Li, Junyi Jessy and
Loaiciga, Sharid and
Zeldes, Amir and
Li, Chuyuan",
booktitle = "Proceedings of the 5th Workshop on Computational Approaches to Discourse (CODI 2024)",
month = mar,
year = "2024",
address = "St. Julians, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.codi-1.5",
pages = "50--63",
abstract = "Discourse segmentation received increased attention in the past years, however the majority of studies have focused on written genres and with high-resource languages. This paper investigates discourse segmentation of a Taiwan Southern Min spontaneous speech corpus. We compare the fine-tuning a Language Model (LLM using two approaches: supervised, thanks to a high-quality annotated dataset, and weakly-supervised, requiring only a small amount of manual labeling. The corpus used here is transcribed with both Chinese characters and romanized transcription. This allows us to compare the impact of the written form on the discourse segmentation task. Additionally, the dataset includes manual prosodic breaks labeling, allowing an exploration of the role prosody can play in contemporary discourse segmentation systems grounded in LLMs. In our study, the supervised approach outperforms weak-supervision ; character-based version demonstrated better scores compared to the romanized version; and prosodic information proved to be an interesting source to increase discourse segmentation performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="prevot-wang-2024-experimenting">
<titleInfo>
<title>Experimenting with Discourse Segmentation of Taiwan Southern Min Spontaneous Speech</title>
</titleInfo>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Prévot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng-Fu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Computational Approaches to Discourse (CODI 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Strube</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chloe</namePart>
<namePart type="family">Braud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Hardmeier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junyi</namePart>
<namePart type="given">Jessy</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharid</namePart>
<namePart type="family">Loaiciga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zeldes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chuyuan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julians, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Discourse segmentation received increased attention in the past years, however the majority of studies have focused on written genres and with high-resource languages. This paper investigates discourse segmentation of a Taiwan Southern Min spontaneous speech corpus. We compare the fine-tuning a Language Model (LLM using two approaches: supervised, thanks to a high-quality annotated dataset, and weakly-supervised, requiring only a small amount of manual labeling. The corpus used here is transcribed with both Chinese characters and romanized transcription. This allows us to compare the impact of the written form on the discourse segmentation task. Additionally, the dataset includes manual prosodic breaks labeling, allowing an exploration of the role prosody can play in contemporary discourse segmentation systems grounded in LLMs. In our study, the supervised approach outperforms weak-supervision ; character-based version demonstrated better scores compared to the romanized version; and prosodic information proved to be an interesting source to increase discourse segmentation performance.</abstract>
<identifier type="citekey">prevot-wang-2024-experimenting</identifier>
<location>
<url>https://aclanthology.org/2024.codi-1.5</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>50</start>
<end>63</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Experimenting with Discourse Segmentation of Taiwan Southern Min Spontaneous Speech
%A Prévot, Laurent
%A Wang, Sheng-Fu
%Y Strube, Michael
%Y Braud, Chloe
%Y Hardmeier, Christian
%Y Li, Junyi Jessy
%Y Loaiciga, Sharid
%Y Zeldes, Amir
%Y Li, Chuyuan
%S Proceedings of the 5th Workshop on Computational Approaches to Discourse (CODI 2024)
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julians, Malta
%F prevot-wang-2024-experimenting
%X Discourse segmentation received increased attention in the past years, however the majority of studies have focused on written genres and with high-resource languages. This paper investigates discourse segmentation of a Taiwan Southern Min spontaneous speech corpus. We compare the fine-tuning a Language Model (LLM using two approaches: supervised, thanks to a high-quality annotated dataset, and weakly-supervised, requiring only a small amount of manual labeling. The corpus used here is transcribed with both Chinese characters and romanized transcription. This allows us to compare the impact of the written form on the discourse segmentation task. Additionally, the dataset includes manual prosodic breaks labeling, allowing an exploration of the role prosody can play in contemporary discourse segmentation systems grounded in LLMs. In our study, the supervised approach outperforms weak-supervision ; character-based version demonstrated better scores compared to the romanized version; and prosodic information proved to be an interesting source to increase discourse segmentation performance.
%U https://aclanthology.org/2024.codi-1.5
%P 50-63
Markdown (Informal)
[Experimenting with Discourse Segmentation of Taiwan Southern Min Spontaneous Speech](https://aclanthology.org/2024.codi-1.5) (Prévot & Wang, CODI-WS 2024)
ACL