@inproceedings{nguyen-thanh-bui-doan-2006-word,
    title = "Word Segmentation for {V}ietnamese Text Categorization An {I}nternet-based Statistic and Genetic Algorithm Approach",
    author = "Nguyen Thanh, Hung and
      Bui Doan, Khanh",
    editor = "Mertens, Piet and
      Fairon, C{\'e}drick and
      Dister, Anne and
      Watrin, Patrick",
    booktitle = "Actes de la 13{\`e}me conf{\'e}rence sur le Traitement Automatique des Langues Naturelles. Posters",
    month = apr,
    year = "2006",
    address = "Leuven, Belgique",
    publisher = "ATALA",
    url = "https://aclanthology.org/2006.jeptalnrecital-poster.20/",
    pages = "561--570",
    abstract = "This paper suggests a novel Vietnamese segmentation approach for text categorization. Instead of using an annotated training corpus or a lexicon which are still lacking in Vietnamese, we use both statistical information extracted directly from a commercial search engine and a genetic algorithm to find the optimal routes to segmentation. The extracted information includes document frequency and n-gram mutual information. Our experiment results obtained on the segmentation and categorization of online news abstracts are very promising. It matches near 80 {\%} human judgment on segmentation and over 90 {\%} micro-averaging F1 in categorization. The processing time is less than one second per document when statistical information is cached."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; the mods ID and the citekey identifier below carry the same key. -->
  <mods ID="nguyen-thanh-bui-doan-2006-word">
    <titleInfo>
      <title>Word Segmentation for Vietnamese Text Categorization An Internet-based Statistic and Genetic Algorithm Approach</title>
    </titleInfo>
    <!-- Authors of the paper, in document order. -->
    <name type="personal">
      <namePart type="given">Hung</namePart>
      <namePart type="family">Nguyen Thanh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Khanh</namePart>
      <namePart type="family">Bui Doan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2006-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Actes de la 13ème conférence sur le Traitement Automatique des Langues Naturelles. Posters</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Piet</namePart>
        <namePart type="family">Mertens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Cédrick</namePart>
        <namePart type="family">Fairon</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anne</namePart>
        <namePart type="family">Dister</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Patrick</namePart>
        <namePart type="family">Watrin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ATALA</publisher>
        <place>
          <placeTerm type="text">Leuven, Belgique</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <!-- Abstract text is kept on one line so that no indentation whitespace becomes part of the value. -->
    <abstract>This paper suggests a novel Vietnamese segmentation approach for text categorization. Instead of using an annotated training corpus or a lexicon which are still lacking in Vietnamese, we use both statistical information extracted directly from a commercial search engine and a genetic algorithm to find the optimal routes to segmentation. The extracted information includes document frequency and n-gram mutual information. Our experiment results obtained on the segmentation and categorization of online news abstracts are very promising. It matches near 80 % human judgment on segmentation and over 90 % micro-averaging F1 in categorization. The processing time is less than one second per document when statistical information is cached.</abstract>
    <identifier type="citekey">nguyen-thanh-bui-doan-2006-word</identifier>
    <location>
      <url>https://aclanthology.org/2006.jeptalnrecital-poster.20/</url>
    </location>
    <!-- Date and page extent of this paper. -->
    <part>
      <date>2006-04</date>
      <extent unit="page">
        <start>561</start>
        <end>570</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Word Segmentation for Vietnamese Text Categorization An Internet-based Statistic and Genetic Algorithm Approach
%A Nguyen Thanh, Hung
%A Bui Doan, Khanh
%Y Mertens, Piet
%Y Fairon, Cédrick
%Y Dister, Anne
%Y Watrin, Patrick
%S Actes de la 13ème conférence sur le Traitement Automatique des Langues Naturelles. Posters
%D 2006
%8 April
%I ATALA
%C Leuven, Belgique
%F nguyen-thanh-bui-doan-2006-word
%X This paper suggests a novel Vietnamese segmentation approach for text categorization. Instead of using an annotated training corpus or a lexicon which are still lacking in Vietnamese, we use both statistical information extracted directly from a commercial search engine and a genetic algorithm to find the optimal routes to segmentation. The extracted information includes document frequency and n-gram mutual information. Our experiment results obtained on the segmentation and categorization of online news abstracts are very promising. It matches near 80 % human judgment on segmentation and over 90 % micro-averaging F1 in categorization. The processing time is less than one second per document when statistical information is cached.
%U https://aclanthology.org/2006.jeptalnrecital-poster.20/
%P 561-570
Markdown (Informal)
[Word Segmentation for Vietnamese Text Categorization An Internet-based Statistic and Genetic Algorithm Approach](https://aclanthology.org/2006.jeptalnrecital-poster.20/) (Nguyen Thanh & Bui Doan, JEP/TALN/RECITAL 2006)
ACL