@inproceedings{saad-falcon-etal-2024-pdftriage,
title = "{PDFT}riage: Question Answering over Long, Structured Documents",
author = "Saad-Falcon, Jon and
Barrow, Joe and
Siu, Alexa and
Nenkova, Ani and
Yoon, Seunghyun and
Rossi, Ryan A. and
Dernoncourt, Franck",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.13",
doi = "10.18653/v1/2024.emnlp-industry.13",
pages = "153--169",
abstract = "Large Language Models (LLMs) have issues with document question answering (QA) in situations where the document is unable to fit in the small context length of an LLM. To overcome this issue, most existing works focus on retrieving the relevant context from the document, representing them as plain text. However, documents such as PDFs, web pages, and presentations are naturally structured with different pages, tables, sections, and so on. Representing such structured documents as plain text is incongruous with the user{'}s mental model of these documents with rich structure. When a system has to query the document for context, this incongruity is brought to the fore, and seemingly trivial questions can trip up the QA system. To bridge this fundamental gap in handling structured documents, we propose an approach called PDFTriage that enables models to retrieve the context based on either structure or content. Our experiments demonstrate the effectiveness of the proposed PDFTriage-augmented models across several classes of questions where existing retrieval-augmented LLMs fail. To facilitate further research on this fundamental problem, we release our benchmark dataset consisting of 900+ human-generated questions over 80 structured documents from 10 different categories of question types for document QA. Our code and datasets will be released soon on Github.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="saad-falcon-etal-2024-pdftriage">
<titleInfo>
<title>PDFTriage: Question Answering over Long, Structured Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jon</namePart>
<namePart type="family">Saad-Falcon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joe</namePart>
<namePart type="family">Barrow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexa</namePart>
<namePart type="family">Siu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ani</namePart>
<namePart type="family">Nenkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seunghyun</namePart>
<namePart type="family">Yoon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Rossi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) have issues with document question answering (QA) in situations where the document is unable to fit in the small context length of an LLM. To overcome this issue, most existing works focus on retrieving the relevant context from the document, representing them as plain text. However, documents such as PDFs, web pages, and presentations are naturally structured with different pages, tables, sections, and so on. Representing such structured documents as plain text is incongruous with the user’s mental model of these documents with rich structure. When a system has to query the document for context, this incongruity is brought to the fore, and seemingly trivial questions can trip up the QA system. To bridge this fundamental gap in handling structured documents, we propose an approach called PDFTriage that enables models to retrieve the context based on either structure or content. Our experiments demonstrate the effectiveness of the proposed PDFTriage-augmented models across several classes of questions where existing retrieval-augmented LLMs fail. To facilitate further research on this fundamental problem, we release our benchmark dataset consisting of 900+ human-generated questions over 80 structured documents from 10 different categories of question types for document QA. Our code and datasets will be released soon on Github.</abstract>
<identifier type="citekey">saad-falcon-etal-2024-pdftriage</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-industry.13</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.13</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>153</start>
<end>169</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PDFTriage: Question Answering over Long, Structured Documents
%A Saad-Falcon, Jon
%A Barrow, Joe
%A Siu, Alexa
%A Nenkova, Ani
%A Yoon, Seunghyun
%A Rossi, Ryan A.
%A Dernoncourt, Franck
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F saad-falcon-etal-2024-pdftriage
%X Large Language Models (LLMs) have issues with document question answering (QA) in situations where the document is unable to fit in the small context length of an LLM. To overcome this issue, most existing works focus on retrieving the relevant context from the document, representing them as plain text. However, documents such as PDFs, web pages, and presentations are naturally structured with different pages, tables, sections, and so on. Representing such structured documents as plain text is incongruous with the user’s mental model of these documents with rich structure. When a system has to query the document for context, this incongruity is brought to the fore, and seemingly trivial questions can trip up the QA system. To bridge this fundamental gap in handling structured documents, we propose an approach called PDFTriage that enables models to retrieve the context based on either structure or content. Our experiments demonstrate the effectiveness of the proposed PDFTriage-augmented models across several classes of questions where existing retrieval-augmented LLMs fail. To facilitate further research on this fundamental problem, we release our benchmark dataset consisting of 900+ human-generated questions over 80 structured documents from 10 different categories of question types for document QA. Our code and datasets will be released soon on Github.
%R 10.18653/v1/2024.emnlp-industry.13
%U https://aclanthology.org/2024.emnlp-industry.13
%U https://doi.org/10.18653/v1/2024.emnlp-industry.13
%P 153-169
Markdown (Informal)
[PDFTriage: Question Answering over Long, Structured Documents](https://aclanthology.org/2024.emnlp-industry.13) (Saad-Falcon et al., EMNLP 2024)
ACL
- Jon Saad-Falcon, Joe Barrow, Alexa Siu, Ani Nenkova, Seunghyun Yoon, Ryan A. Rossi, and Franck Dernoncourt. 2024. PDFTriage: Question Answering over Long, Structured Documents. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 153–169, Miami, Florida, US. Association for Computational Linguistics.