iccv 2025 submission

This commit is contained in:
Tobias Christian Nauen
2026-02-24 12:08:38 +01:00
parent 5c08f9d31a
commit 78765791be
353 changed files with 3269 additions and 6604 deletions


@@ -3,69 +3,58 @@
\section{Introduction}
\label{sec:intro}
% \begin{figure}
% \centering
% \includegraphics[width=.5\columnwidth]{img/fig-1.pdf}
% \caption{\schemename factorizes each training image into a foreground object and a background, then recombines them on the fly while controlling background identity, object position, and object scale. Standard, strong augmentations are applied afterwards.}
% \label{fig:fig-1}
% \end{figure}
\begin{table}[t]
\caption{Examples of \schemename generated images (center cropped) from ImageNet.
We successfully segment even multiple objects (\textit{Macaw}) and complex shapes (\textit{Cricket}).}
\label{tab:foraug-examples}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{ccccc}
\toprule
Class & \makecell{Original \\Image} & \makecell{Extracted \\Foreground} & \makecell{Infilled \\Background} & Recombined Examples \\
\midrule
Macaw & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507_v1_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507_v1_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v12.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v18.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v3.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v4.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v6.JPEG} \\
% Conch & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v9.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v10.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v11.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v12.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v17.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v8.JPEG} \\
Cricket & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v0.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v10.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v16.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v2.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v6.JPEG} \\
Laptop & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v0.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v1.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v11.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v14.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v2.JPEG} \\
\bottomrule
\end{tabular}
}
\end{table}
% \begin{itemize}
% \item General intro: image classification
% \item ImageNet
% \item CNNs $\to$ Transformers
% \item Traditional Data Augmentation: CNNs
% \item Problems with that: Other model properties of Transformers
% \item Our approach: Recombining ImageNet foregrounds and backgrounds
% \end{itemize}
\begin{figure}
\centering
\includegraphics[width=\columnwidth]{img/fig-1.pdf}
\caption{Comparison of \name and ImageNet. \name recombines foreground objects with different backgrounds each epoch, thus creating a more diverse training set. We still apply traditional data augmentation afterwards.}
\label{fig:fig-1}
\end{figure}
Large-scale image classification is a central driver of modern computer vision: it benchmarks progress in the field~\cite{Khan2022,Rangel2024}, powers model pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b}, and yields broadly transferable representations that underpin applications such as medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2023a}, and object recognition~\cite{Carion2020,He2017,Girshick2014}.
However, classification supervision is weak in an important sense: the label does not specify \emph{how} the object of the labeled class should appear.
In ImageNet~\cite{Deng2009}, for example, objects often occur at characteristic positions and scales and co-occur with correlated scene context~\cite{Fatima2025,Barbu2019}.
% In datasets such as ImageNet, objects often occur at characteristic positions and scales and co-occur with correlated scene context~\cite{Fatima2025,Barbu2019}.
As a result, models rely on shortcuts such as background cues, center bias, or size bias that boost in-distribution accuracy but hurt robustness and transfer~\cite{Geirhos2020,Fatima2025,Barbu2019}.
Data augmentation is the default defense against these shortcuts.
Standard transformations (crop/flip/color jitter) and stronger policies such as MixUp~\cite{Zhang2018a}/CutMix~\cite{Yun2019} and automated augmentation search~\cite{Cubuk2019,Cubuk2020} expand appearance diversity~\cite{Shorten2019,Xu2023d}. % , yet they largely preserve the original \emph{composition} of each image~\cite{Shorten2019,Xu2023d}.
However, their ability to teach spatial and compositional invariances is limited.
This constraint matters especially for Vision Transformers (ViTs)~\cite{Dosovitskiy2021}: with weaker built-in spatial inductive biases than Convolutional Neural Networks (CNNs), ViTs must learn key invariances (e.g., robustness to translation and scale) primarily from data.
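For illustration, a minimal sketch of such a standard strong augmentation pipeline, written with \texttt{torchvision} and not tied to any specific training setup, is shown below; MixUp and CutMix operate on whole batches and are applied separately in the training loop.
\begin{verbatim}
# Minimal, illustrative sketch of a standard strong
# augmentation pipeline using torchvision.
# MixUp/CutMix act on whole batches and are applied
# separately in the training loop.
from torchvision import transforms

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),      # crop + rescale
    transforms.RandomHorizontalFlip(),      # random flip
    transforms.ColorJitter(0.4, 0.4, 0.4),  # color jitter
    transforms.RandAugment(),               # automated policy
    transforms.ToTensor(),
])
\end{verbatim}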
Copy-paste-style augmentations~\cite{Ghiasi2021,Kang2022} alter composition more aggressively by overlaying segmented objects onto other images.
However, these methods are typically designed for detection or instance segmentation, relying on the dense human annotations available for those tasks or using unconstrained dataset images as backgrounds.
As a result, they offer no fine-grained control over object position and scale and do not ensure that the background onto which the object is pasted is semantically neutral, which creates ambiguous labels for classification.
To encode compositional invariances directly in the training data, we propose \emph{Foreground-Background Augmentation} (\schemename), a controlled composition augmentation that \emph{explicitly factorizes each image into foreground and background, then recombines them for label-preserving, interpretable distribution shifts}.
Concretely, \schemename uses off-the-shelf segmentation and inpainting models to (i) extract a foreground object and synthesize a class-consistent, semantically neutral background, and (ii) paste the foreground onto diverse neutral backgrounds while controlling its position and scale (see \Cref{tab:foraug-examples}).
Unlike prior copy-paste methods that simply overlay objects onto arbitrary scenes~\cite{Ghiasi2021,Kang2022}, \schemename first removes and neutralizes the original background, then samples from well-defined distributions of backgrounds, object positions, and object sizes.
This explicit factorization preserves a clean label for the recombined image while providing direct control over composition, enabling us to break spurious correlations while fitting seamlessly into modern strong augmentation pipelines. % (see \Cref{fig:fig-1}).
% Throughout, we apply \schemename on top of strong augmentation pipelines (RandAugment, Mixup, CutMix), so any gains are complementary to these widely used techniques.
% As it is important that any gains are complementary to strong augmentation pipelines (RandAugment, MixUp, CutMix), we apply \schemename on top of these widely used techniques.
To ensure that all gains are complementary to strong augmentation pipelines (RandAugment, MixUp, CutMix), we apply \schemename on top of these widely used techniques.
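For illustration, the recombination step can be sketched as follows; this is a simplified, illustrative version rather than our released implementation, and it assumes that the foreground crop (with its segmentation mask as alpha channel) and the inpainted background have already been extracted as described above.
\begin{verbatim}
# Illustrative sketch of the recombination step
# described above (not the released implementation).
# `foreground` is an RGBA crop whose alpha channel is
# the segmentation mask; `background` is an inpainted,
# object-free image. Both are assumed precomputed.
import random
from PIL import Image

def recombine(foreground: Image.Image,
              background: Image.Image,
              scale_range=(0.3, 1.0)) -> Image.Image:
    bg = background.copy()
    # Sample the object scale relative to the
    # background's shorter side.
    scale = random.uniform(*scale_range)
    ratio = scale * min(bg.size) / max(foreground.size)
    fg = foreground.resize(
        (max(1, int(foreground.width * ratio)),
         max(1, int(foreground.height * ratio))))
    # Sample the object position uniformly over all
    # valid placements.
    x = random.randint(0, bg.width - fg.width)
    y = random.randint(0, bg.height - fg.height)
    # Alpha-composite the object onto the background.
    bg.paste(fg, (x, y), mask=fg)
    return bg
\end{verbatim}
One such recombined image is generated per sample and epoch, before the standard augmentations are applied.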
Empirically, \schemename yields consistent accuracy gains across architectures, improving ImageNet top-1 accuracy by up to 6 p.p. and fine-grained downstream accuracy by up to 7.3 p.p., and even improving transfer when ImageNet accuracy is matched.
Beyond accuracy, training with \schemename substantially improves robustness on standard distribution-shift benchmarks, where we observe gains of roughly 2--19 p.p. across ViT, Swin, and ResNet architectures.
These gains come from training on \name, the ImageNet instantiation of \schemename, which recombines foreground objects with different backgrounds on the fly and gives fine-grained control over image composition.
Finally, the same control knobs turn \schemename into a targeted diagnostic tool for shortcut reliance and model robustness.
We quantify background reliance via controlled background swaps, measure foreground focus by leveraging our knowledge of the object's placement, and probe center and size biases through systematic position and scale sweeps, showing that training with \schemename reduces these biases.
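For illustration, the background-swap probe can be sketched as follows; the helper names are illustrative, \texttt{recombine} refers to the sketch above, and the reported value is the fraction of background swaps that leave the predicted class unchanged.
\begin{verbatim}
# Illustrative sketch of the background-swap probe:
# how stable is the prediction when only the background
# of an image is replaced? `recombine` is the helper
# sketched above; `model` and `preprocess` are assumed
# to be a trained classifier and its input transform.
import torch

@torch.no_grad()
def background_consistency(model, foreground,
                           backgrounds, preprocess,
                           device="cpu"):
    model.eval()
    preds = []
    for bg in backgrounds:
        img = preprocess(recombine(foreground, bg))
        logits = model(img.unsqueeze(0).to(device))
        preds.append(logits.argmax(dim=1).item())
    # Fraction of swaps keeping the prediction unchanged.
    return sum(p == preds[0] for p in preds) / len(preds)
\end{verbatim}
Center and size biases can be probed analogously by sweeping the sampled object position and scale instead of the background.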
We make our code for \schemename and the \name dataset publicly available\footnote{Link will go here.} to facilitate further research.
\medskip
\noindent
\textbf{Contributions}
\begin{itemize}[topsep=0pt]
\item \textbf{Controlled composition augmentation for classification.}
We introduce \schemename, a foreground-background factorization and recombination scheme for image classification that creates label-preserving training samples with explicit control over background identity, object position, and object scale.
\item \textbf{Accuracy and transfer gains.}
Training with \schemename on top of standard strong augmentation pipelines improves ImageNet top-1 accuracy by up to 6 p.p., boosts fine-grained downstream accuracy by up to 7.3 p.p., and increases accuracy on shifted distributions by up to 19 p.p.
\item \textbf{Controlled bias diagnostics and mitigation.}
Using the same controls during evaluation, we measure background reliance, foreground focus, and position/scale biases through targeted distribution shifts.
\schemename systematically reduces shortcut behaviors and model biases.
\end{itemize}