@inproceedings{mehrabi-etal-2023-resolving,
title = "Resolving Ambiguities in Text-to-Image Generative Models",
author = "Mehrabi, Ninareh and
Goyal, Palash and
Verma, Apurv and
Dhamala, Jwala and
Kumar, Varun and
Hu, Qian and
Chang, Kai-Wei and
Zemel, Richard and
Galstyan, Aram and
Gupta, Rahul",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.804",
doi = "10.18653/v1/2023.acl-long.804",
pages = "14367--14388",
abstract = "Natural language often contains ambiguities that can lead to misinterpretation and miscommunication. While humans can handle ambiguities effectively by asking clarifying questions and/or relying on contextual cues and common-sense knowledge, resolving ambiguities can be notoriously hard for machines. In this work, we study ambiguities that arise in text-to-image generative models. We curate the Text-to-image Ambiguity Benchmark (TAB) dataset to study different types of ambiguities in text-to-image generative models. We then propose the Text-to-ImagE Disambiguation (TIED) framework to disambiguate the prompts given to the text-to-image generative models by soliciting clarifications from the end user. Through automatic and human evaluations, we show the effectiveness of our framework in generating more faithful images aligned with end user intention in the presence of ambiguities.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mehrabi-etal-2023-resolving">
<titleInfo>
<title>Resolving Ambiguities in Text-to-Image Generative Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Palash</namePart>
<namePart type="family">Goyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apurv</namePart>
<namePart type="family">Verma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Varun</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qian</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Zemel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galstyan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Natural language often contains ambiguities that can lead to misinterpretation and miscommunication. While humans can handle ambiguities effectively by asking clarifying questions and/or relying on contextual cues and common-sense knowledge, resolving ambiguities can be notoriously hard for machines. In this work, we study ambiguities that arise in text-to-image generative models. We curate the Text-to-image Ambiguity Benchmark (TAB) dataset to study different types of ambiguities in text-to-image generative models. We then propose the Text-to-ImagE Disambiguation (TIED) framework to disambiguate the prompts given to the text-to-image generative models by soliciting clarifications from the end user. Through automatic and human evaluations, we show the effectiveness of our framework in generating more faithful images aligned with end user intention in the presence of ambiguities.</abstract>
<identifier type="citekey">mehrabi-etal-2023-resolving</identifier>
<identifier type="doi">10.18653/v1/2023.acl-long.804</identifier>
<location>
<url>https://aclanthology.org/2023.acl-long.804</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>14367</start>
<end>14388</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Resolving Ambiguities in Text-to-Image Generative Models
%A Mehrabi, Ninareh
%A Goyal, Palash
%A Verma, Apurv
%A Dhamala, Jwala
%A Kumar, Varun
%A Hu, Qian
%A Chang, Kai-Wei
%A Zemel, Richard
%A Galstyan, Aram
%A Gupta, Rahul
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F mehrabi-etal-2023-resolving
%X Natural language often contains ambiguities that can lead to misinterpretation and miscommunication. While humans can handle ambiguities effectively by asking clarifying questions and/or relying on contextual cues and common-sense knowledge, resolving ambiguities can be notoriously hard for machines. In this work, we study ambiguities that arise in text-to-image generative models. We curate the Text-to-image Ambiguity Benchmark (TAB) dataset to study different types of ambiguities in text-to-image generative models. We then propose the Text-to-ImagE Disambiguation (TIED) framework to disambiguate the prompts given to the text-to-image generative models by soliciting clarifications from the end user. Through automatic and human evaluations, we show the effectiveness of our framework in generating more faithful images aligned with end user intention in the presence of ambiguities.
%R 10.18653/v1/2023.acl-long.804
%U https://aclanthology.org/2023.acl-long.804
%U https://doi.org/10.18653/v1/2023.acl-long.804
%P 14367-14388
Markdown (Informal)
[Resolving Ambiguities in Text-to-Image Generative Models](https://aclanthology.org/2023.acl-long.804) (Mehrabi et al., ACL 2023)
ACL
- Ninareh Mehrabi, Palash Goyal, Apurv Verma, Jwala Dhamala, Varun Kumar, Qian Hu, Kai-Wei Chang, Richard Zemel, Aram Galstyan, and Rahul Gupta. 2023. Resolving Ambiguities in Text-to-Image Generative Models. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 14367–14388, Toronto, Canada. Association for Computational Linguistics.