@inproceedings{9217294920734875aad6da23d83abb65,
title = "Mismatch Quest: Visual and Textual Feedback for Image-Text Misalignment",
abstract = "While existing image-text alignment models reach high quality binary assessments, they fall short of pinpointing the exact source of misalignment. In this paper, we present a method to provide detailed textual and visual explanation of detected misalignments between text-image pairs. We leverage large language models and visual grounding models to automatically construct a training set that holds plausible misaligned captions for a given image and corresponding textual explanations and visual indicators. We also publish a new human curated test set comprising ground-truth textual and visual misalignment annotations. Empirical results show that fine-tuning vision language models on our training set enables them to articulate misalignments and visually indicate them within images, outperforming strong baselines both on the binary alignment classification and the explanation generation tasks. Our code and human curated test set are available at: https://github.com/MismatchQuest/MismatchQuest.",
author = "Brian Gordon and Yonatan Bitton and Yonatan Shafir and Roopal Garg and Xi Chen and Dani Lischinski and Daniel Cohen-Or and Idan Szpektor",
note = "Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2025.; 18th European Conference on Computer Vision, ECCV 2024 ; Conference date: 29-09-2024 Through 04-10-2024",
year = "2025",
doi = "10.1007/978-3-031-72998-0_18",
language = "אנגלית",
isbn = "9783031729973",
series = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
publisher = "Springer Science and Business Media Deutschland GmbH",
pages = "310--328",
editor = "Ale{\v s} Leonardis and Elisa Ricci and Stefan Roth and Olga Russakovsky and Torsten Sattler and G{\"u}l Varol",
booktitle = "Computer Vision – ECCV 2024 - 18th European Conference, Proceedings",
address = "גרמניה",
}