@inproceedings{siyuan-etal-2024-layoutpointer,
title = "{L}ayout{P}ointer: A Spatial-Context Adaptive Pointer Network for Visual Information Extraction",
author = "Siyuan, Huang and
Xiong, Yongping and
Guibin, Wu",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.207/",
doi = "10.18653/v1/2024.naacl-long.207",
pages = "3737--3748",
abstract = "Visual Information Extraction (VIE), as a crucial task of Document Intelligence, involves two primary sub-tasks: Semantic Entity Recognition (SER) and Relation Extraction (RE). However, VIE faces two significant challenges. Firstly, most existing models inadequately utilize spatial information of entities, often failing to predict connections or incorrectly linking spatially distant entities. Secondly, the improper input order of tokens challenges in extracting complete entity pairs from documents with multi-line entities when text is extracted via PDF parser or OCR. To address these challenges, we propose LayoutPointer, a Spatial-Context Adaptive Pointer Network. LayoutPointer explicitly enhances spatial-context relationships by incorporating 2D relative position information and adaptive spatial constraints within self-attention. Furthermore, we recast the RE task as a specialized cycle detection problem, employing a unique tail-to-head pointer to restore the semantic order among multi-line entities. To better evaluate the effectiveness of our proposed method, we reconstruct a multi-line dataset named MLFUD, which more accurately reflects real-world scenarios. Fine-tuning experimental results on FUNSD, XFUND, and MLFUD datasets demonstrate that LayoutPointer significantly outperforms existing state-of-the-art methods in F1 scores for RE tasks (e.g., 5.71{\%} improvement on XFUND using LayoutPointer$_{\text{BASE-X}}$ over LayoutLMv3)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="siyuan-etal-2024-layoutpointer">
<titleInfo>
<title>LayoutPointer: A Spatial-Context Adaptive Pointer Network for Visual Information Extraction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Huang</namePart>
<namePart type="family">Siyuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongping</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wu</namePart>
<namePart type="family">Guibin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual Information Extraction (VIE), as a crucial task of Document Intelligence, involves two primary sub-tasks: Semantic Entity Recognition (SER) and Relation Extraction (RE). However, VIE faces two significant challenges. Firstly, most existing models inadequately utilize spatial information of entities, often failing to predict connections or incorrectly linking spatially distant entities. Secondly, the improper input order of tokens poses challenges in extracting complete entity pairs from documents with multi-line entities when text is extracted via a PDF parser or OCR. To address these challenges, we propose LayoutPointer, a Spatial-Context Adaptive Pointer Network. LayoutPointer explicitly enhances spatial-context relationships by incorporating 2D relative position information and adaptive spatial constraints within self-attention. Furthermore, we recast the RE task as a specialized cycle detection problem, employing a unique tail-to-head pointer to restore the semantic order among multi-line entities. To better evaluate the effectiveness of our proposed method, we reconstruct a multi-line dataset named MLFUD, which more accurately reflects real-world scenarios. Fine-tuning experimental results on FUNSD, XFUND, and MLFUD datasets demonstrate that LayoutPointer significantly outperforms existing state-of-the-art methods in F1 scores for RE tasks (e.g., 5.71% improvement on XFUND using LayoutPointer_BASE-X over LayoutLMv3).</abstract>
<identifier type="citekey">siyuan-etal-2024-layoutpointer</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.207</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.207/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>3737</start>
<end>3748</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LayoutPointer: A Spatial-Context Adaptive Pointer Network for Visual Information Extraction
%A Siyuan, Huang
%A Xiong, Yongping
%A Guibin, Wu
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F siyuan-etal-2024-layoutpointer
%X Visual Information Extraction (VIE), as a crucial task of Document Intelligence, involves two primary sub-tasks: Semantic Entity Recognition (SER) and Relation Extraction (RE). However, VIE faces two significant challenges. Firstly, most existing models inadequately utilize spatial information of entities, often failing to predict connections or incorrectly linking spatially distant entities. Secondly, the improper input order of tokens poses challenges in extracting complete entity pairs from documents with multi-line entities when text is extracted via a PDF parser or OCR. To address these challenges, we propose LayoutPointer, a Spatial-Context Adaptive Pointer Network. LayoutPointer explicitly enhances spatial-context relationships by incorporating 2D relative position information and adaptive spatial constraints within self-attention. Furthermore, we recast the RE task as a specialized cycle detection problem, employing a unique tail-to-head pointer to restore the semantic order among multi-line entities. To better evaluate the effectiveness of our proposed method, we reconstruct a multi-line dataset named MLFUD, which more accurately reflects real-world scenarios. Fine-tuning experimental results on FUNSD, XFUND, and MLFUD datasets demonstrate that LayoutPointer significantly outperforms existing state-of-the-art methods in F1 scores for RE tasks (e.g., 5.71% improvement on XFUND using LayoutPointer_BASE-X over LayoutLMv3).
%R 10.18653/v1/2024.naacl-long.207
%U https://aclanthology.org/2024.naacl-long.207/
%U https://doi.org/10.18653/v1/2024.naacl-long.207
%P 3737-3748
Markdown (Informal)
[LayoutPointer: A Spatial-Context Adaptive Pointer Network for Visual Information Extraction](https://aclanthology.org/2024.naacl-long.207/) (Siyuan et al., NAACL 2024)
ACL
Huang Siyuan, Yongping Xiong, and Wu Guibin. 2024. [LayoutPointer: A Spatial-Context Adaptive Pointer Network for Visual Information Extraction](https://aclanthology.org/2024.naacl-long.207/). In *Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)*, pages 3737–3748, Mexico City, Mexico. Association for Computational Linguistics.