@inproceedings{giguet-lucas-2022-greyc,
title = "{GREYC}@{F}in{TOC}-2022: Handling Document Layout and Structure in Native {PDF} Bundle of Documents",
author = "Giguet, Emmanuel and
Lucas, Nadine",
editor = "El-Haj, Mahmoud and
Rayson, Paul and
Zmandar, Nadhem",
booktitle = "Proceedings of the 4th Financial Narrative Processing Workshop @LREC2022",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.fnp-1.15/",
pages = "100--104",
abstract = "In this paper, we present our contribution to the FinTOC-2022 Shared Task {\textquotedblleft}Financial Document Structure Extraction{\textquotedblright}. We participated in the three tracks dedicated to English, French and Spanish document processing. Our main contribution consists in considering financial prospectus as a bundle of documents, i.e., a set of merged documents, each with their own layout and structure. Therefore, Document Layout and Structure Analysis (DLSA) first starts with the boundary detection of each document using general layout features. Then, the process applies inside each single document, taking advantage of the local properties. DLSA is achieved considering simultaneously text content, vectorial shapes and images embedded in the native PDF document. For the Title Detection task in English and French, we observed a significant improvement of the F-measures for Title Detection compared with those obtained during our previous participation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="giguet-lucas-2022-greyc">
<titleInfo>
<title>GREYC@FinTOC-2022: Handling Document Layout and Structure in Native PDF Bundle of Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Emmanuel</namePart>
<namePart type="family">Giguet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadine</namePart>
<namePart type="family">Lucas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Financial Narrative Processing Workshop @LREC2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mahmoud</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadhem</namePart>
<namePart type="family">Zmandar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we present our contribution to the FinTOC-2022 Shared Task “Financial Document Structure Extraction”. We participated in the three tracks dedicated to English, French and Spanish document processing. Our main contribution consists in considering financial prospectus as a bundle of documents, i.e., a set of merged documents, each with their own layout and structure. Therefore, Document Layout and Structure Analysis (DLSA) first starts with the boundary detection of each document using general layout features. Then, the process applies inside each single document, taking advantage of the local properties. DLSA is achieved considering simultaneously text content, vectorial shapes and images embedded in the native PDF document. For the Title Detection task in English and French, we observed a significant improvement of the F-measures for Title Detection compared with those obtained during our previous participation.</abstract>
<identifier type="citekey">giguet-lucas-2022-greyc</identifier>
<location>
<url>https://aclanthology.org/2022.fnp-1.15/</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>100</start>
<end>104</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GREYC@FinTOC-2022: Handling Document Layout and Structure in Native PDF Bundle of Documents
%A Giguet, Emmanuel
%A Lucas, Nadine
%Y El-Haj, Mahmoud
%Y Rayson, Paul
%Y Zmandar, Nadhem
%S Proceedings of the 4th Financial Narrative Processing Workshop @LREC2022
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F giguet-lucas-2022-greyc
%X In this paper, we present our contribution to the FinTOC-2022 Shared Task “Financial Document Structure Extraction”. We participated in the three tracks dedicated to English, French and Spanish document processing. Our main contribution consists in considering financial prospectus as a bundle of documents, i.e., a set of merged documents, each with their own layout and structure. Therefore, Document Layout and Structure Analysis (DLSA) first starts with the boundary detection of each document using general layout features. Then, the process applies inside each single document, taking advantage of the local properties. DLSA is achieved considering simultaneously text content, vectorial shapes and images embedded in the native PDF document. For the Title Detection task in English and French, we observed a significant improvement of the F-measures for Title Detection compared with those obtained during our previous participation.
%U https://aclanthology.org/2022.fnp-1.15/
%P 100-104
Markdown (Informal)
[GREYC@FinTOC-2022: Handling Document Layout and Structure in Native PDF Bundle of Documents](https://aclanthology.org/2022.fnp-1.15/) (Giguet & Lucas, FNP 2022)
ACL