@inproceedings{asakura-miyao-2024-needed,
title = "What Is Needed for Intra-document Disambiguation of Math Identifiers?",
author = "Asakura, Takuto and
Miyao, Yusuke",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1522/",
pages = "17500--17512",
abstract = "In automated scientific document analysis, accurately interpreting math formulae is imperative alongside comprehending natural language. Ambiguity in math identifiers within a single document poses significant challenges to understanding math formulae. While disambiguating math identifiers across documents has seen some progress, resolving ambiguity within a document remains inadequately researched due to complexity and insufficient datasets. The level of difficulty and information required to accomplish this task was uncertain. This study aims to determine which information is necessary for the intra-document disambiguation of math identifiers. Our findings indicate that the position data and local formula structure surrounding the identifiers, including modifiers, are particularly critical. For our study, we expanded a dataset for formula grounding and doubled its size to include annotations for 27,655 math identifier occurrences. We have created a multi-layer perceptron model that performs similarly to humans, with an 85{\%} accuracy and a kappa value of 0.73, outperforming rule-based baselines. We trained and evaluated the model with papers in natural language processing (NLP). Our findings were also confirmed valid in fields other than NLP by applying the trained models to papers from various fields. These results will aid in improving mathematical language processing, such as mathematical information retrieval."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="asakura-miyao-2024-needed">
<titleInfo>
<title>What Is Needed for Intra-document Disambiguation of Math Identifiers?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Takuto</namePart>
<namePart type="family">Asakura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Miyao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In automated scientific document analysis, accurately interpreting math formulae is imperative alongside comprehending natural language. Ambiguity in math identifiers within a single document poses significant challenges to understanding math formulae. While disambiguating math identifiers across documents has seen some progress, resolving ambiguity within a document remains inadequately researched due to complexity and insufficient datasets. The level of difficulty and information required to accomplish this task was uncertain. This study aims to determine which information is necessary for the intra-document disambiguation of math identifiers. Our findings indicate that the position data and local formula structure surrounding the identifiers, including modifiers, are particularly critical. For our study, we expanded a dataset for formula grounding and doubled its size to include annotations for 27,655 math identifier occurrences. We have created a multi-layer perceptron model that performs similarly to humans, with an 85% accuracy and a kappa value of 0.73, outperforming rule-based baselines. We trained and evaluated the model with papers in natural language processing (NLP). Our findings were also confirmed valid in fields other than NLP by applying the trained models to papers from various fields. These results will aid in improving mathematical language processing, such as mathematical information retrieval.</abstract>
<identifier type="citekey">asakura-miyao-2024-needed</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1522/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>17500</start>
<end>17512</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What Is Needed for Intra-document Disambiguation of Math Identifiers?
%A Asakura, Takuto
%A Miyao, Yusuke
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F asakura-miyao-2024-needed
%X In automated scientific document analysis, accurately interpreting math formulae is imperative alongside comprehending natural language. Ambiguity in math identifiers within a single document poses significant challenges to understanding math formulae. While disambiguating math identifiers across documents has seen some progress, resolving ambiguity within a document remains inadequately researched due to complexity and insufficient datasets. The level of difficulty and information required to accomplish this task was uncertain. This study aims to determine which information is necessary for the intra-document disambiguation of math identifiers. Our findings indicate that the position data and local formula structure surrounding the identifiers, including modifiers, are particularly critical. For our study, we expanded a dataset for formula grounding and doubled its size to include annotations for 27,655 math identifier occurrences. We have created a multi-layer perceptron model that performs similarly to humans, with an 85% accuracy and a kappa value of 0.73, outperforming rule-based baselines. We trained and evaluated the model with papers in natural language processing (NLP). Our findings were also confirmed valid in fields other than NLP by applying the trained models to papers from various fields. These results will aid in improving mathematical language processing, such as mathematical information retrieval.
%U https://aclanthology.org/2024.lrec-main.1522/
%P 17500-17512
Markdown (Informal)
[What Is Needed for Intra-document Disambiguation of Math Identifiers?](https://aclanthology.org/2024.lrec-main.1522/) (Asakura & Miyao, LREC-COLING 2024)
ACL