@inproceedings{bonn-etal-2024-adjudicating,
title = "Adjudicating {LLM}s as {P}rop{B}ank Adjudicators",
author = "Bonn, Julia and
Tayyar Madabushi, Harish and
Hwang, Jena D. and
Bonial, Claire",
editor = "Bonial, Claire and
Bonn, Julia and
Hwang, Jena D.",
booktitle = "Proceedings of the Fifth International Workshop on Designing Meaning Representations @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.dmr-1.12",
pages = "112--123",
abstract = "We evaluate the ability of large language models (LLMs) to provide PropBank semantic role label annotations across different realizations of the same verbs in transitive, intransitive, and middle voice constructions. In order to assess the meta-linguistic capabilities of LLMs as well as their ability to glean such capabilities through in-context learning, we evaluate the models in a zero-shot setting, in a setting where it is given three examples of another verb used in transitive, intransitive, and middle voice constructions, and finally in a setting where it is given the examples as well as the correct sense and roleset information. We find that zero-shot knowledge of PropBank annotation is almost nonexistent. The largest model evaluated, GPT-4, achieves the best performance in the setting where it is given both examples and the correct roleset in the prompt, demonstrating that larger models can ascertain some meta-linguistic capabilities through in-context learning. However, even in this setting, which is simpler than the task of a human in PropBank annotation, the model achieves only 48{\%} accuracy in marking numbered arguments correctly. To ensure transparency and reproducibility, we publicly release our dataset and model responses.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bonn-etal-2024-adjudicating">
<titleInfo>
<title>Adjudicating LLMs as PropBank Adjudicators</title>
</titleInfo>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Bonn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harish</namePart>
<namePart type="family">Tayyar Madabushi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jena</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth International Workshop on Designing Meaning Representations @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Bonn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jena</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We evaluate the ability of large language models (LLMs) to provide PropBank semantic role label annotations across different realizations of the same verbs in transitive, intransitive, and middle voice constructions. In order to assess the meta-linguistic capabilities of LLMs as well as their ability to glean such capabilities through in-context learning, we evaluate the models in a zero-shot setting, in a setting where it is given three examples of another verb used in transitive, intransitive, and middle voice constructions, and finally in a setting where it is given the examples as well as the correct sense and roleset information. We find that zero-shot knowledge of PropBank annotation is almost nonexistent. The largest model evaluated, GPT-4, achieves the best performance in the setting where it is given both examples and the correct roleset in the prompt, demonstrating that larger models can ascertain some meta-linguistic capabilities through in-context learning. However, even in this setting, which is simpler than the task of a human in PropBank annotation, the model achieves only 48% accuracy in marking numbered arguments correctly. To ensure transparency and reproducibility, we publicly release our dataset and model responses.</abstract>
<identifier type="citekey">bonn-etal-2024-adjudicating</identifier>
<location>
<url>https://aclanthology.org/2024.dmr-1.12</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>112</start>
<end>123</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Adjudicating LLMs as PropBank Adjudicators
%A Bonn, Julia
%A Tayyar Madabushi, Harish
%A Hwang, Jena D.
%A Bonial, Claire
%Y Bonial, Claire
%Y Bonn, Julia
%Y Hwang, Jena D.
%S Proceedings of the Fifth International Workshop on Designing Meaning Representations @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F bonn-etal-2024-adjudicating
%X We evaluate the ability of large language models (LLMs) to provide PropBank semantic role label annotations across different realizations of the same verbs in transitive, intransitive, and middle voice constructions. In order to assess the meta-linguistic capabilities of LLMs as well as their ability to glean such capabilities through in-context learning, we evaluate the models in a zero-shot setting, in a setting where it is given three examples of another verb used in transitive, intransitive, and middle voice constructions, and finally in a setting where it is given the examples as well as the correct sense and roleset information. We find that zero-shot knowledge of PropBank annotation is almost nonexistent. The largest model evaluated, GPT-4, achieves the best performance in the setting where it is given both examples and the correct roleset in the prompt, demonstrating that larger models can ascertain some meta-linguistic capabilities through in-context learning. However, even in this setting, which is simpler than the task of a human in PropBank annotation, the model achieves only 48% accuracy in marking numbered arguments correctly. To ensure transparency and reproducibility, we publicly release our dataset and model responses.
%U https://aclanthology.org/2024.dmr-1.12
%P 112-123
Markdown (Informal)
[Adjudicating LLMs as PropBank Adjudicators](https://aclanthology.org/2024.dmr-1.12) (Bonn et al., DMR-WS 2024)
ACL
- Julia Bonn, Harish Tayyar Madabushi, Jena D. Hwang, and Claire Bonial. 2024. Adjudicating LLMs as PropBank Adjudicators. In Proceedings of the Fifth International Workshop on Designing Meaning Representations @ LREC-COLING 2024, pages 112–123, Torino, Italia. ELRA and ICCL.