@inproceedings{gao-etal-2022-field,
  author    = {Gao, Mingfei and Chen, Zeyuan and Naik, Nikhil and Hashimoto, Kazuma and Xiong, Caiming and Xu, Ran},
  title     = {Field Extraction from Forms with Unlabeled Data},
  editor    = {Das, Rajarshi and Lewis, Patrick and Min, Sewon and Thai, June and Zaheer, Manzil},
  booktitle = {Proceedings of the 1st Workshop on Semiparametric Methods in NLP: Decoupling Logic from Knowledge},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland and Online},
  publisher = {Association for Computational Linguistics},
  pages     = {30--40},
  doi       = {10.18653/v1/2022.spanlp-1.4},
  url       = {https://aclanthology.org/2022.spanlp-1.4/},
  abstract  = {We propose a novel framework to conduct field extraction from forms with unlabeled data. To bootstrap the training process, we develop a rule-based method for mining noisy pseudo-labels from unlabeled forms. Using the supervisory signal from the pseudo-labels, we extract a discriminative token representation from a transformer-based model by modeling the interaction between text in the form. To prevent the model from overfitting to label noise, we introduce a refinement module based on a progressive pseudo-label ensemble. Experimental results demonstrate the effectiveness of our framework.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="gao-etal-2022-field">
    <!-- Title of the paper -->
    <titleInfo>
      <title>Field Extraction from Forms with Unlabeled Data</title>
    </titleInfo>
    <!-- Authors, in listed order -->
    <name type="personal">
      <namePart type="given">Mingfei</namePart>
      <namePart type="family">Gao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zeyuan</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nikhil</namePart>
      <namePart type="family">Naik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kazuma</namePart>
      <namePart type="family">Hashimoto</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Caiming</namePart>
      <namePart type="family">Xiong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ran</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors and publisher details -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Semiparametric Methods in NLP: Decoupling Logic from Knowledge</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Rajarshi</namePart>
        <namePart type="family">Das</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Patrick</namePart>
        <namePart type="family">Lewis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sewon</namePart>
        <namePart type="family">Min</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">June</namePart>
        <namePart type="family">Thai</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Manzil</namePart>
        <namePart type="family">Zaheer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland and Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We propose a novel framework to conduct field extraction from forms with unlabeled data. To bootstrap the training process, we develop a rule-based method for mining noisy pseudo-labels from unlabeled forms. Using the supervisory signal from the pseudo-labels, we extract a discriminative token representation from a transformer-based model by modeling the interaction between text in the form. To prevent the model from overfitting to label noise, we introduce a refinement module based on a progressive pseudo-label ensemble. Experimental results demonstrate the effectiveness of our framework.</abstract>
    <!-- Identifiers and access location -->
    <identifier type="citekey">gao-etal-2022-field</identifier>
    <identifier type="doi">10.18653/v1/2022.spanlp-1.4</identifier>
    <location>
      <url>https://aclanthology.org/2022.spanlp-1.4/</url>
    </location>
    <!-- Date and page extent within the host volume -->
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>30</start>
        <end>40</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Field Extraction from Forms with Unlabeled Data
%A Gao, Mingfei
%A Chen, Zeyuan
%A Naik, Nikhil
%A Hashimoto, Kazuma
%A Xiong, Caiming
%A Xu, Ran
%Y Das, Rajarshi
%Y Lewis, Patrick
%Y Min, Sewon
%Y Thai, June
%Y Zaheer, Manzil
%S Proceedings of the 1st Workshop on Semiparametric Methods in NLP: Decoupling Logic from Knowledge
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland and Online
%F gao-etal-2022-field
%X We propose a novel framework to conduct field extraction from forms with unlabeled data. To bootstrap the training process, we develop a rule-based method for mining noisy pseudo-labels from unlabeled forms. Using the supervisory signal from the pseudo-labels, we extract a discriminative token representation from a transformer-based model by modeling the interaction between text in the form. To prevent the model from overfitting to label noise, we introduce a refinement module based on a progressive pseudo-label ensemble. Experimental results demonstrate the effectiveness of our framework.
%R 10.18653/v1/2022.spanlp-1.4
%U https://aclanthology.org/2022.spanlp-1.4/
%U https://doi.org/10.18653/v1/2022.spanlp-1.4
%P 30-40
Markdown (Informal)
[Field Extraction from Forms with Unlabeled Data](https://aclanthology.org/2022.spanlp-1.4/) (Gao et al., SpaNLP 2022)
ACL
- Mingfei Gao, Zeyuan Chen, Nikhil Naik, Kazuma Hashimoto, Caiming Xiong, and Ran Xu. 2022. Field Extraction from Forms with Unlabeled Data. In Proceedings of the 1st Workshop on Semiparametric Methods in NLP: Decoupling Logic from Knowledge, pages 30–40, Dublin, Ireland and Online. Association for Computational Linguistics.