@inproceedings{razzhigaev-etal-2023-kandinsky,
title = "Kandinsky: An Improved Text-to-Image Synthesis with Image Prior and Latent Diffusion",
author = "Razzhigaev, Anton and
Shakhmatov, Arseniy and
Maltseva, Anastasia and
Arkhipkin, Vladimir and
Pavlov, Igor and
Ryabov, Ilya and
Kuts, Angelina and
Panchenko, Alexander and
Kuznetsov, Andrey and
Dimitrov, Denis",
editor = "Feng, Yansong and
Lefever, Els",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-demo.25",
doi = "10.18653/v1/2023.emnlp-demo.25",
pages = "286--295",
abstract = "Text-to-image generation is a significant domain in modern computer vision and achieved substantial improvements through the evolution of generative architectures. Among these, diffusion-based models demonstrated essential quality enhancements. These models generally split into two categories: pixel-level and latent-level approaches. We present Kandinsky {--} a novel exploration of latent diffusion architecture, combining the principles of image prior models with latent diffusion techniques. The image prior model, is trained separately to map CLIP text and image embeddings. Another distinct feature of the proposed model is the modified MoVQ implementation, which serves as the image autoencoder component. Overall the designed model contains 3.3B parameters. We also deployed a user-friendly demo system that supports diverse generative modes such as text-to-image generation, image fusion, text and image fusion, image variations generation and text-guided inpainting/outpainting. Additionally we released the source code and checkpoints for Kandinsky models. Experimental evaluations demonstrate FID score of 8.03 on the COCO-30K dataset, marking our model as the top open source performer in terms of measurable image generation quality.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="razzhigaev-etal-2023-kandinsky">
<titleInfo>
<title>Kandinsky: An Improved Text-to-Image Synthesis with Image Prior and Latent Diffusion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Razzhigaev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arseniy</namePart>
<namePart type="family">Shakhmatov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Maltseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladimir</namePart>
<namePart type="family">Arkhipkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Igor</namePart>
<namePart type="family">Pavlov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilya</namePart>
<namePart type="family">Ryabov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angelina</namePart>
<namePart type="family">Kuts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Panchenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrey</namePart>
<namePart type="family">Kuznetsov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denis</namePart>
<namePart type="family">Dimitrov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yansong</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Els</namePart>
<namePart type="family">Lefever</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text-to-image generation is a significant domain in modern computer vision and achieved substantial improvements through the evolution of generative architectures. Among these, diffusion-based models demonstrated essential quality enhancements. These models generally split into two categories: pixel-level and latent-level approaches. We present Kandinsky – a novel exploration of latent diffusion architecture, combining the principles of image prior models with latent diffusion techniques. The image prior model, is trained separately to map CLIP text and image embeddings. Another distinct feature of the proposed model is the modified MoVQ implementation, which serves as the image autoencoder component. Overall the designed model contains 3.3B parameters. We also deployed a user-friendly demo system that supports diverse generative modes such as text-to-image generation, image fusion, text and image fusion, image variations generation and text-guided inpainting/outpainting. Additionally we released the source code and checkpoints for Kandinsky models. Experimental evaluations demonstrate FID score of 8.03 on the COCO-30K dataset, marking our model as the top open source performer in terms of measurable image generation quality.</abstract>
<identifier type="citekey">razzhigaev-etal-2023-kandinsky</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-demo.25</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-demo.25</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>286</start>
<end>295</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Kandinsky: An Improved Text-to-Image Synthesis with Image Prior and Latent Diffusion
%A Razzhigaev, Anton
%A Shakhmatov, Arseniy
%A Maltseva, Anastasia
%A Arkhipkin, Vladimir
%A Pavlov, Igor
%A Ryabov, Ilya
%A Kuts, Angelina
%A Panchenko, Alexander
%A Kuznetsov, Andrey
%A Dimitrov, Denis
%Y Feng, Yansong
%Y Lefever, Els
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F razzhigaev-etal-2023-kandinsky
%X Text-to-image generation is a significant domain in modern computer vision and achieved substantial improvements through the evolution of generative architectures. Among these, diffusion-based models demonstrated essential quality enhancements. These models generally split into two categories: pixel-level and latent-level approaches. We present Kandinsky – a novel exploration of latent diffusion architecture, combining the principles of image prior models with latent diffusion techniques. The image prior model, is trained separately to map CLIP text and image embeddings. Another distinct feature of the proposed model is the modified MoVQ implementation, which serves as the image autoencoder component. Overall the designed model contains 3.3B parameters. We also deployed a user-friendly demo system that supports diverse generative modes such as text-to-image generation, image fusion, text and image fusion, image variations generation and text-guided inpainting/outpainting. Additionally we released the source code and checkpoints for Kandinsky models. Experimental evaluations demonstrate FID score of 8.03 on the COCO-30K dataset, marking our model as the top open source performer in terms of measurable image generation quality.
%R 10.18653/v1/2023.emnlp-demo.25
%U https://aclanthology.org/2023.emnlp-demo.25
%U https://doi.org/10.18653/v1/2023.emnlp-demo.25
%P 286-295
Markdown (Informal)
[Kandinsky: An Improved Text-to-Image Synthesis with Image Prior and Latent Diffusion](https://aclanthology.org/2023.emnlp-demo.25) (Razzhigaev et al., EMNLP 2023)
ACL
- Anton Razzhigaev, Arseniy Shakhmatov, Anastasia Maltseva, Vladimir Arkhipkin, Igor Pavlov, Ilya Ryabov, Angelina Kuts, Alexander Panchenko, Andrey Kuznetsov, and Denis Dimitrov. 2023. Kandinsky: An Improved Text-to-Image Synthesis with Image Prior and Latent Diffusion. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 286–295, Singapore. Association for Computational Linguistics.