@inproceedings{santos-etal-2022-cost,
title = "Cost-Effective Language Driven Image Editing with {LX}-{DRIM}",
author = "Santos, Rodrigo and
Branco, Ant{\'o}nio and
Silva, Jo{\~a}o Ricardo",
booktitle = "Proceedings of the First Workshop on Performance and Interpretability Evaluations of Multimodal, Multipurpose, Massive-Scale Models",
month = oct,
year = "2022",
address = "Virtual",
publisher = "International Conference on Computational Linguistics",
url = "https://aclanthology.org/2022.mmmpie-1.5",
pages = "31--43",
abstract = "Cross-modal language and image processing is envisaged as a way to improve language understanding by resorting to visual grounding, but only recently, with the emergence of neural architectures specifically tailored to cope with both modalities, has it attracted increased attention and obtained promising results. In this paper we address a cross-modal task of language-driven image design, in particular the task of altering a given image on the basis of language instructions. We also avoid the need for a specifically tailored architecture and resort instead to a general purpose model in the Transformer family. Experiments with the resulting tool, LX-DRIM, show very encouraging results, confirming the viability of the approach for language-driven image design while keeping it affordable in terms of compute and data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="santos-etal-2022-cost">
<titleInfo>
<title>Cost-Effective Language Driven Image Editing with LX-DRIM</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rodrigo</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">António</namePart>
<namePart type="family">Branco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="given">Ricardo</namePart>
<namePart type="family">Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Performance and Interpretability Evaluations of Multimodal, Multipurpose, Massive-Scale Models</title>
</titleInfo>
<originInfo>
<publisher>International Conference on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Cross-modal language and image processing is envisaged as a way to improve language understanding by resorting to visual grounding, but only recently, with the emergence of neural architectures specifically tailored to cope with both modalities, has it attracted increased attention and obtained promising results. In this paper we address a cross-modal task of language-driven image design, in particular the task of altering a given image on the basis of language instructions. We also avoid the need for a specifically tailored architecture and resort instead to a general purpose model in the Transformer family. Experiments with the resulting tool, LX-DRIM, show very encouraging results, confirming the viability of the approach for language-driven image design while keeping it affordable in terms of compute and data.</abstract>
<identifier type="citekey">santos-etal-2022-cost</identifier>
<location>
<url>https://aclanthology.org/2022.mmmpie-1.5</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>31</start>
<end>43</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Cost-Effective Language Driven Image Editing with LX-DRIM
%A Santos, Rodrigo
%A Branco, António
%A Silva, João Ricardo
%S Proceedings of the First Workshop on Performance and Interpretability Evaluations of Multimodal, Multipurpose, Massive-Scale Models
%D 2022
%8 October
%I International Conference on Computational Linguistics
%C Virtual
%F santos-etal-2022-cost
%X Cross-modal language and image processing is envisaged as a way to improve language understanding by resorting to visual grounding, but only recently, with the emergence of neural architectures specifically tailored to cope with both modalities, has it attracted increased attention and obtained promising results. In this paper we address a cross-modal task of language-driven image design, in particular the task of altering a given image on the basis of language instructions. We also avoid the need for a specifically tailored architecture and resort instead to a general purpose model in the Transformer family. Experiments with the resulting tool, LX-DRIM, show very encouraging results, confirming the viability of the approach for language-driven image design while keeping it affordable in terms of compute and data.
%U https://aclanthology.org/2022.mmmpie-1.5
%P 31-43
Markdown (Informal)
[Cost-Effective Language Driven Image Editing with LX-DRIM](https://aclanthology.org/2022.mmmpie-1.5) (Santos et al., MMMPIE 2022)
ACL
- Rodrigo Santos, António Branco, and João Ricardo Silva. 2022. Cost-Effective Language Driven Image Editing with LX-DRIM. In Proceedings of the First Workshop on Performance and Interpretability Evaluations of Multimodal, Multipurpose, Massive-Scale Models, pages 31–43, Virtual. International Conference on Computational Linguistics.