@inproceedings{zhang-etal-2022-stapi,
title = "{STAPI}: An Automatic Scraper for Extracting Iterative Title-Text Structure from Web Documents",
author = "Zhang, Nan and
Wilson, Shomir and
Mitra, Prasenjit",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.371/",
pages = "3461--3470",
abstract = "Formal documents often are organized into sections of text, each with a title, and extracting this structure remains an under-explored aspect of natural language processing. This iterative title-text structure is valuable data for building models for headline generation and section title generation, but there is no corpus that contains web documents annotated with titles and prose texts. Therefore, we propose the first title-text dataset on web documents that incorporates a wide variety of domains to facilitate downstream training. We also introduce STAPI (Section Title And Prose text Identifier), a two-step system for labeling section titles and prose text in HTML documents. To filter out unrelated content like document footers, its first step involves a filter that reads HTML documents and proposes a set of textual candidates. In the second step, a typographic classifier takes the candidates from the filter and categorizes each one into one of the three pre-defined classes (title, prose text, and miscellany). We show that STAPI significantly outperforms two baseline models in terms of title-text identification. We release our dataset along with a web application to facilitate supervised and semi-supervised training in this domain."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2022-stapi">
<titleInfo>
<title>STAPI: An Automatic Scraper for Extracting Iterative Title-Text Structure from Web Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shomir</namePart>
<namePart type="family">Wilson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prasenjit</namePart>
<namePart type="family">Mitra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Formal documents often are organized into sections of text, each with a title, and extracting this structure remains an under-explored aspect of natural language processing. This iterative title-text structure is valuable data for building models for headline generation and section title generation, but there is no corpus that contains web documents annotated with titles and prose texts. Therefore, we propose the first title-text dataset on web documents that incorporates a wide variety of domains to facilitate downstream training. We also introduce STAPI (Section Title And Prose text Identifier), a two-step system for labeling section titles and prose text in HTML documents. To filter out unrelated content like document footers, its first step involves a filter that reads HTML documents and proposes a set of textual candidates. In the second step, a typographic classifier takes the candidates from the filter and categorizes each one into one of the three pre-defined classes (title, prose text, and miscellany). We show that STAPI significantly outperforms two baseline models in terms of title-text identification. We release our dataset along with a web application to facilitate supervised and semi-supervised training in this domain.</abstract>
<identifier type="citekey">zhang-etal-2022-stapi</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.371/</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>3461</start>
<end>3470</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T STAPI: An Automatic Scraper for Extracting Iterative Title-Text Structure from Web Documents
%A Zhang, Nan
%A Wilson, Shomir
%A Mitra, Prasenjit
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F zhang-etal-2022-stapi
%X Formal documents often are organized into sections of text, each with a title, and extracting this structure remains an under-explored aspect of natural language processing. This iterative title-text structure is valuable data for building models for headline generation and section title generation, but there is no corpus that contains web documents annotated with titles and prose texts. Therefore, we propose the first title-text dataset on web documents that incorporates a wide variety of domains to facilitate downstream training. We also introduce STAPI (Section Title And Prose text Identifier), a two-step system for labeling section titles and prose text in HTML documents. To filter out unrelated content like document footers, its first step involves a filter that reads HTML documents and proposes a set of textual candidates. In the second step, a typographic classifier takes the candidates from the filter and categorizes each one into one of the three pre-defined classes (title, prose text, and miscellany). We show that STAPI significantly outperforms two baseline models in terms of title-text identification. We release our dataset along with a web application to facilitate supervised and semi-supervised training in this domain.
%U https://aclanthology.org/2022.lrec-1.371/
%P 3461-3470
Markdown (Informal)
[STAPI: An Automatic Scraper for Extracting Iterative Title-Text Structure from Web Documents](https://aclanthology.org/2022.lrec-1.371/) (Zhang et al., LREC 2022)
ACL