This commit is contained in:
Tobias Christian Nauen
2026-02-24 11:57:25 +01:00
parent 7e66c96a60
commit e8cc0ee8a6
275 changed files with 16336 additions and 836 deletions

View File

@@ -2,14 +2,18 @@
\begin{abstract}
Transformers, particularly Vision Transformers (ViTs), have achieved state-of-the-art performance in large-scale image classification.
However, they often require large amounts of data and can exhibit biases that limit their robustness and generalizability.
This paper introduces \schemename, a novel data augmentation scheme that addresses these challenges and explicitly includes inductive biases, which commonly are part of the neural network architecture, into the training data.
However, they often require large amounts of data and can exhibit biases, such as center or size bias, that limit their robustness and generalizability.
This paper introduces \schemename, a novel data augmentation operation that addresses these challenges by explicitly imposing invariances into the training data, which are otherwise part of the neural network architecture.
% This paper introduces \name, a novel dataset derived from ImageNet that addresses these challenges.
\schemename is constructed by using pretrained foundation models to separate and recombine foreground objects with different backgrounds, enabling fine-grained control over image composition during training.
It thus increases the data diversity and effective number of training samples.
We demonstrate that training on \name, the application of \schemename to ImageNet, significantly improves the accuracy of ViTs and other architectures by up to 4.5 percentage points (p.p.) on ImageNet and 7.3 p.p. on downstream tasks.
Importantly, \schemename enables novel ways of analyzing model behavior and quantifying biases.
Namely, we introduce metrics for background robustness, foreground focus, center bias, and size bias and show that training on \name substantially reduces these biases compared to training on ImageNet.
\schemename is constructed by using pretrained foundation models to separate and recombine foreground objects with different backgrounds.
% enabling fine-grained control over image composition during training.
% Missing sentence here of how you use it to generate data in what way and with what purpose wrt to bias
This recombination step enables us to take fine-grained control over object position and size, as well as background selection.
% It thus increases the data diversity and effective number of training samples.
We demonstrate that using \schemename significantly improves the accuracy of ViTs and other architectures by up to 4.5 percentage points (p.p.) on ImageNet, which translates to 7.3 p.p. on downstream tasks.
% Importantly, \schemename enables novel ways of analyzing model behavior and quantifying biases.
Importantly, \schemename not only improves accuracy but also opens new ways to analyze model behavior and quantify biases.
Namely, we introduce metrics for background robustness, foreground focus, center bias, and size bias and show that using \schemename during training substantially reduces these biases.
In summary, \schemename provides a valuable tool for analyzing and mitigating biases, enabling the development of more robust and reliable computer vision models.
Our code and dataset are publicly available at \url{https://github.com/tobna/ForAug}.
Our code and dataset are publicly available at \code{https://github.com/tobna/ForAug}.
\end{abstract}

View File

@@ -2,5 +2,7 @@
\subsection*{Acknowledgements}
\label{sec:acknowledgements}
This work was funded by the Carl-Zeiss Foundation under the Sustainable Embedded AI project (P2021-02-009) and by the EU project SustainML (Horizon Europe grant agreement No 101070408).
All compute was done thanks to the Pegasus cluster at DFKI.
% Will be in the final paper.
This work was funded by the Carl-Zeiss Foundation under the Sustainable Embedded AI project (P2021-02-009). by the EU project SustainML (Horizon Europe grant agreement No 101070408) and by the BMFTR project Albatross (funding code 16IW24002).
All compute was done thanks to the Pegasus cluster at DFKI Kaiserslautern.

View File

@@ -1,57 +1,154 @@
% !TeX root = ../supplementary.tex
% !TeX root = ../main.tex
\section{Extended Bates Distribution}
\begin{figure}[h!]
\centering
\includegraphics[width=.5\columnwidth]{img/bates.pdf}
\caption{Plot of the probability distribution function (PDF) of the extended Bates distribution for different parameters $\eta$. Higher values of $\eta$ concentrate the distribution around the center.}
\label{fig:bates-pdf}
\end{figure}
% Finally, we analyze the foreground object's positioning in the image.
% We utilize an extended Bates distribution to sample the position of the foreground object.
% The Bates distribution~\cite{Bates1955} with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
% Therefore, the larger $\eta$, the more concentrated the distribution is around the center.
% We extend this concept to $\eta \leq -1$ by shifting the distribution away from the center and towards the edges.
% We extend this concept to $\eta \leq -1$ by defining
% \begin{align*}
% X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta)
% \end{align*}
% for $\eta \leq 1$ with $s$ being the sawtooth function on $[0, 1]$:
% \begin{align}
% s(x) = \begin{cases}
% x + 0.5 & \text{if } 0 < x < 0.5 \\
% x - 0.5 & \text{if } 0.5 \leq x \leq 1
% \end{cases}
% \end{align}
% Note that $s \circ s = \id$ on $[0, 1]$.
% This way, distributions with $\eta \leq -1$ are more concentrated around the borders.
% $\eta = 1$ and $\eta = -1$ both correspond to the uniform distribution.
% The PDF of this extended Bates distribution is visualized in \Cref{fig:bates-pdf}.
We introduce an extension of the Bates distribution~\cite{Bates1955} to include negative parameters, enabling sampling of foreground object positions away from the image center.
The standard Bates distribution, for $\eta \in \N$, is defined as the mean of $\eta$ independent random variables drawn from a uniform distribution \cite{Jonhson1995}.
A larger $\eta$ value increases the concentration of samples around the distribution's mean, which in this case is the image center.
To achieve an opposite effect--concentrating samples at the image borders--we extend the distribution to $\eta \leq 1$.
\begin{align*}
X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta)
\end{align*}
This is accomplished by sampling from a standard Bates distribution with parameter $-\eta \geq 1$ and then applying a sawtooth function.
The sawtooth function on the interval $[0,1]$ is defined as
\begin{align}
s(x) = \begin{cases}
x + 0.5 & \text{if } 0 < x < 0.5 \\
x - 0.5 & \text{if } 0.5 \leq x \leq 1
\end{cases}
\end{align}
This function effectively maps the central portion of the interval to the edges and the edge portions to the center.
For example, a value of 0.3 (central-left) is mapped to 0.8 (edge-right), while 0.8 (edge-right) is mapped to 0.3 (central-left).
This transformation inverts the distribution's concentration, shifting the probability mass from the center to the borders.
We visualize the distribution function of the extended Bates distribution in \Cref{fig:bates-pdf}.
Both $\eta = 1$ and $\eta = -1$ result in a uniform distribution across the image.
\section{Resource Usage of \schemename}
To utilize the proposed \schemename, specific computational resources are necessary, particularly for computing and storing for the output of the segmentation stage and for on-the-fly processing of the recombination stage.
\paragraph{Segmentation.}
% While calculating the segmentations and infills takes a lot of compute, this is effort that has to be spent only once per dataset.
\schemename involves a computationally expensive segmentation and infill stage, which is a one-time calculation per dataset.
Once computed, the segmentation and infill results can be perpetually reused, amortizing the initial cost over all subsequent experiments and applications.
On NVIDIA H100 GPUs, the segmentation stage will compute at a rate of $374.3 \frac{\text{img}}{\text{GPU} \times \text{h}}$ when using Attentive Eraser or $5 338.6 \frac{\text{img}}{\text{GPU} \times \text{h}}$ for LaMa.
For ImageNet this comes down to just under 9 days (Attentive Eraser) or 16 hours (LaMa) on two 8 GPU nodes.
To facilitate immediate use and reproduction of results, we publicly provide the precalculated segmentation stage output for the ImageNet dataset for download\footnote{Link will go here.}.
The output of \schemename's segmentation step on ImageNet dataset requires 73 GB of additional disk space for the segmentation output, which is separate from the base 147 GB ImageNet size.
\paragraph{Recombination.}
The recombination step of \schemename is implemented as a based data loader operation.
It's thus offloaded to the CPU, where it can be heavily parallelized and thus only results in a very minor increase in the training step-time.
For example, using a ViT-B model on an NVIDIA A100 GPU, the average update step-time increased by $1\%$, from $528 \pm 2$ ms to $534 \pm 1$ ms.
\section{Training Setup}
\label{sec:training_setup}
\begin{table}[h]
\begin{table*}[h!]
\centering
\begin{tabular}{lc}
\toprule
Parameter & Value \\
\midrule
Image Resolution & $224 \times 224$ \\
Epochs & 300 \\
Learning Rate & 3e-3 \\
Learning Rate Schedule & cosine decay \\
Batch Size & 2048 \\
Warmup Schedule & linear \\
Warmup Epochs & 3 \\
Weight Decay & 0.02 \\
Label Smoothing & 0.1 \\
Optimizer & Lamb \cite{You2020} \\
Data Augmentation Policy & 3-Augment \cite{Touvron2022} \\
\bottomrule
\end{tabular}
\caption{Training setup for our ImageNet and \name training.}
\caption{Training setup and hyperparameters for our ImageNet training.}
\label{tab:in-setup}
\end{table}
\begin{table}[h]
\centering
\begin{tabular}{lccc}
\begin{tabular}{lcc}
\toprule
Dataset & Batch Size & Epochs & Learning Rate \\
Parameter & ViT, Swin, ResNet & DeiT \\
\midrule
Aircraft & 512 & 500 & 3e-4 \\
Cars & 1024 & 500 & 3e-4 \\
Flowers & 256 & 500 & 3e-4 \\
Food & 2048 & 100 & 3e-4 \\
Pets & 512 & 500 & 3e-4 \\
Image Resolution & $224 \times 224$ & $224 \times 224$ \\
Epochs & 300 & 300 \\
Learning Rate & 3e-3 & S/B: 1e-3, L: 5e-4 \\
Learning Rate Schedule & cosine decay & cosine decay \\
Batch Size & 2048 & 1024 \\
GPUs & $4\times$ NVIDIA A100/H100/H200 & $4\times$ NVIDIA A100/H100/H200 \\
Warmup Schedule & linear & linear \\
Warmup Epochs & 3 & 3 \\
Weight Decay & 0.02 & 0.05 \\
Label Smoothing & 0.1 & 0.1 \\
Optimizer & Lamb \cite{You2020} & AdamW \\
\cmidrule(r){1-1}
Data Augmentation Policy & \textbf{3-Augment \cite{Touvron2022}} & \textbf{DeiT \cite{Touvron2021b}} \\
Augmentations & \makecell{Resize \\ RandomCrop \\ HorizontalFlip \\ Grayscale \\ Solarize \\ GaussianBlur \\ ColorJitter \\ CutMix \cite{Yun2019}} & \makecell{RandomResizedCrop \\ HorizontalFlip \\ RandomErase \cite{Zhong2017} \\ RandAugment \cite{Cubuk2019} \\ ColorJitter \\ Mixup \cite{Zhang2018a} \\ CutMix \cite{Yun2019}} \\
\bottomrule
\end{tabular}
\caption{Training setup for finetuning on different downstream datasets. Other settings are the same as in \Cref{tab:in-setup}.}
\label{tab:downstream-setup}
\end{table}
\end{table*}
On ImageNet we use the same training setup as \cite{Nauen2023} and \cite{Touvron2022} without pretraining.
As our focus is on evaluating the changes in accuracy due to \schemename/\name, like \cite{Nauen2023}, we stick to one set of hyperparameters for all models.
We list the settings used for training on ImageNet and \name in \Cref{tab:in-setup} and the ones used for finetuning those weights on the downstream datasets in \Cref{tab:downstream-setup}.
\newpage
\section{Infill Model Comparison}
\label{sec:infill-model-comparison}
\begin{table}[h!]
\centering
\resizebox{\textwidth}{!}{\begin{tabular}{cc@{\hskip 0.3in}cc}
\caption{Training setup for finetuning on different downstream datasets. Other settings are the same as in \Cref{tab:in-setup}. For finetuning, we always utilize 3-Augment and the related parameters from the \emph{ViT, Swin, ResNet} column of \Cref{tab:in-setup}}
\label{tab:downstream-setup}
\begin{tabular}{lcccc}
\toprule
Dataset & Batch Size & Epochs & Learning Rate & Num. GPUs \\
\midrule
Aircraft & 512 & 500 & 3e-4 & 2 \\
Cars & 1024 & 500 & 3e-4 & 4 \\
Flowers & 256 & 500 & 3e-4 & 1 \\
Food & 2048 & 100 & 3e-4 & 4 \\
Pets & 512 & 500 & 3e-4 & 2 \\
\bottomrule
\end{tabular}
\end{table}
On ImageNet we use the same training setup as \cite{Nauen2025} and \cite{Touvron2022} without pretraining for ViT, Swin, and ResNet.
For DeiT, we train the same ViT architecture but using the data augmentation scheme and hyperparameters from \cite{Touvron2021b}.
As our focus is on evaluating the changes in accuracy due to \schemename, like \cite{Nauen2025}, we stick to one set of hyperparameters for all models.
We list the settings used for training on ImageNet in \Cref{tab:in-setup} and the ones used for finetuning those weights on the downstream datasets in \Cref{tab:downstream-setup}.
Out implementation is using PyTorch \cite{Paszke2019} and the \emph{timm} library \cite{Wightman2019} for model architectures and basic functions.
\begin{table*}[h!]
\centering
\caption{Hardware and Software specifics used for both training and evaluation.}
\label{tab:hw-sw-versions}
\begin{tabular}{ll}
\toprule
Parameter & Value \\
\midrule
GPU & NVIDIA A100/H100/H200 \\
CPU & 24 CPU cores (Intex Xenon) per GPU \\
Memory & up to 120GB per GPU \\
Operating System & Enroot container for SLURM based on Ubuntu 24.04 LTS \\
Python & 3.12.3 \\
PyTorch & 2.7.0 \\
TorchVision & 0.22.0 \\
Timm & 1.0.15 \\
\bottomrule
\end{tabular}
\end{table*}
\Cref{tab:hw-sw-versions} lists the specific hardware we use, as well as versions of the relevant software packages.
\section{Infill Model Comparison}
\begin{table*}[h!]
\centering
\caption{Example infills of LaMa and Attentive Eraser.}
\label{tab:infill-examples}
\resizebox{.9\textwidth}{!}{
\begin{tabular}{cc@{\hskip 0.3in}cc}
\toprule
LaMa & Att. Eraser & LaMa & Att. Eraser \\
\midrule
@@ -64,26 +161,47 @@ We list the settings used for training on ImageNet and \name in \Cref{tab:in-set
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00003097.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00003097.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00011629.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00011629.JPEG} \\
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00000547.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00000547.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00025256.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00025256.JPEG} \\
\bottomrule
\end{tabular}}
\caption{Example infills of LaMa and Attentive Eraser.}
\end{table}
\end{tabular}
}
\end{table*}
We visualize example infilled images for both LaMa \cite{Suvorov2021} and Attentive Eraser \cite{Sun2024} in \Cref{tab:infill-examples}.
We qualitatively find that while LaMa often leaves repeated textures of blurry spots where the object was erased, Attentive Eraser produces slightly cleaner and more coherent infills of the background.
\section{Images with High Infill Ratio}
\label{sec:high-infill-ratio}
\begin{table}[h!]
\newpage
\section{Image Infill Ratio}
\begin{table*}[h!]
\centering
\begin{tabular}{ccc}
\toprule
Infill Ratio & LaMa & Att. Eraser \\
\midrule
93.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} \\ \\
95.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} \\ \\
83.7 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} \\ \\
88.2 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG}}
\end{tabular}
\caption{Example infills with a large relative foreground area size that is infilled (infill ratio).}
\label{tbl:high-rat}
\end{table}
\resizebox{.8\textwidth}{!}{
\begin{tabular}{ccc}
\toprule
Infill Ratio & LaMa & Att. Eraser \\
\midrule
93.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} \\ \\
95.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} \\ \\
83.7 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} \\ \\
88.2 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG}}
\end{tabular}}
\end{table*}
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{img/infill_distr.pdf}
\caption{We plot the distribution of the relative size of the detected foreground object that is infilled in our Segmentation step of ImageNet.
While most images contain objects of smaller size, there is a peak where Grounded~SAM~\cite{Ren2024} detects almost the whole image as the foreground object. For examples of such large infills, see \Cref{tbl:high-rat}.
}
\label{fig:infill-distr}
\end{figure}
\Cref{tbl:high-rat} shows infills for images where Grounded SAM \cite{Ren2024} marks a high percentile of the image as the foreground object (Infill Ratio), that has to be erased by the infill models.
While LaMa tends to fill those spots with mostly black or gray and textures similar to what we saw in \Cref{tab:infill-examples}, Attentive Eraser tends to create novel patterns by copying what is left of the background all over the rest of the image.
% We filter out such mostly infilled background using our background pruning hyperparameter $t_\text{prune} = 0.8$.
\Cref{fig:infill-distr} plots the distribution of infill ratios in \schemename.
While there is a smooth curve of the number of detections decreasing with the infill ratio until $\approx 90\%$, there is an additional peak at $\approx 100\%$ infill ratio.
We believe that this peak is made up of failure cases of Grounded~SAM.
We filter out all backgrounds that have an infill ratio larger than our pruning threshold $t_\text{prune} = 0.8$, which translates to $10\%$ of backgrounds.

View File

@@ -4,8 +4,8 @@
\label{sec:conclusion}
We introduce \schemename, a novel data augmentation scheme that facilitates improved Transformer training for image classification.
By explicitly separating and recombining foreground objects and backgrounds, \schemename enables controlled data augmentation, leading to significant performance gains on ImageNet and downstream fine-grained classification tasks.
By explicitly separating and recombining foreground objects and backgrounds, \schemename enables controlled data augmentation beyond existing image compositions, leading to significant performance gains on ImageNet and downstream fine-grained classification tasks.
Furthermore, \schemename provides a powerful framework for analyzing model behavior and quantifying biases, including background robustness, foreground focus, center bias, and size bias.
Our experiments demonstrate that training on \name, the instantiation of \schemename on ImageNet, not only boosts accuracy but also significantly reduces these biases, resulting in more robust and generalizable models.
Our experiments demonstrate that training using \schemename not only boosts accuracy but also significantly reduces these biases, resulting in more robust and generalizable models.
In the future, we see \schemename be also applied to other datasets and tasks, like video recognition or segmentation.
\schemename's ability to both improve performance and provide insights into model behavior makes it a valuable tool for advancing CV research and developing more reliable AI systems.

View File

@@ -17,62 +17,96 @@
% \item Size bias
% \end{itemize}
We conduct a comprehensive suit of experiments to validate the effectiveness of our approach.
We compare training on \name, the ImageNet instantiation of \schemename, to training on ImageNet for 7 different models.
Furthermore, we assess the impact of using \name for pretraining on multiple fine-grained downstream datasets.
Additionally, we use \schemename's control over the image distribution to quantify some model behaviors and biases.
We conduct a comprehensive suit of experiments to validate the effectiveness of our approach,
% We compare training on \name, the ImageNet instantiation of \schemename, to training on ImageNet for 10 different models.
comparing ImageNet-training with and without \schemename for 10 different models.
Furthermore, we assess the impact of using \schemename for pretraining on multiple fine-grained downstream datasets.
Finally, we exploit \schemename's control over the image distribution to quantify model behaviors and biases.
We always report the mean and standard deviation of three independent training runs.
\subsection{Design Choices of \schemename}
\subsection{Design Choices of ForAug}
\label{sec:ablation}
We start by ablating the design choices of \schemename.
For this, we revert to TinyImageNet \cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each, and Tiny\name, a version of \schemename derived from TinyImageNet.
\Cref{tab:ablation} presents the results of these ablations.
We start by ablating the design choices of \schemename on TinyImageNet~\cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each. %, and Tiny\name, the application of \schemename to TinyImageNet.
% \Cref{tab:ablation} presents the results of these ablations.
\Cref{tab:ablation-segment} presents ablations for segmentation and \Cref{tab:ablation-recombine} for recombination.
\begin{table*}[t]
\begin{table}
\caption{Ablation of the design decisions in the segmentation phase of \schemename on TinyImageNet.
The first line is our baseline, while the other lines are using \schemename.
We use basic settings with the \emph{same} background strategy during recombination for this experiment.
}
\label{tab:ablation-segment}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccccccccccc}
\small
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{cccc}
\toprule
\multirow{2}{*}{Dataset} & Detect. & Infill & FG. & Augmentation & BG. & BG. & edge & original & \multicolumn{2}{c}{TinyImageNet Accuracy} \\
& prompt & Model & size & Order & strategy & pruning & smoothing & image mixing & ViT-Ti [\%] & ViT-S [\%] \\
\cmidrule(r){1-1} \cmidrule(lr){2-9} \cmidrule(l){10-11}
TinyImageNet & & & & & & & & & $66.1\pm0.5$ & $68.3\pm0.7$ \\
Tiny\name & specific & LaMa \cite{Suvorov2021} & mean & crop$\to$paste$\to$color & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\
\gtxt{Tiny\name} & \gtxt{specific} & \gtxt{LaMa \cite{Suvorov2021}} & range & \gtxt{crop$\to$paste$\to$color} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\
\gtxt{Tiny\name} & general & \gtxt{LaMa \cite{Suvorov2021}} & \gtxt{range} & \gtxt{crop$\to$paste$\to$color} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $66.4\pm0.6$ & $72.9\pm0.6$ \\
\gtxt{Tiny\name} & \gtxt{general} & Att. Eraser \cite{Sun2024} & \gtxt{range} & \gtxt{crop$\to$paste$\to$color} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & paste$\to$crop$\to$color & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 2.0$ & \gtxt{-} & $67.2\pm0.4$ & $72.9\pm0.5$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{-} & $65.9\pm0.5$ & $72.4\pm0.6$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & $p=0.2$ & $69.8\pm0.5$ & $75.0\pm0.3$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & $p=0.33$ & $69.5\pm0.4$ & $75.2\pm1.0$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & $p=0.5$ & $70.3\pm1.0$ & $74.2\pm0.2$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & linear & $70.1\pm0.7$ & $74.9\pm0.8$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & reverse lin. & $67.6\pm0.2$ & $73.2\pm0.3$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & cos & $71.3\pm1.0$ & $75.7\pm0.8$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{cos} & $70.0\pm0.8$ & $75.5\pm0.7$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & orig. & \gtxt{0.8} & \gtxt{$\sigma_\text{max} = 4.0$} & \gtxt{cos} & $67.2\pm0.9$ & $69.9\pm1.0$ \\
\gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & all & \gtxt{0.8} & \gtxt{$\sigma_\text{max} = 4.0$} & \gtxt{cos} & $70.1\pm0.7$ & $77.5\pm0.6$ \\
\multirow{2.5}{*}{\makecell{Detect. \\Prompt}} & \multirow{2.5}{*}{\makecell{Infill \\ Model}} & \multicolumn{2}{c}{TinyImageNet Accuracy [\%]} \\
\cmidrule{3-4}
& & ViT-Ti & ViT-S \\
\midrule
\name & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & \gtxt{cos} & - & $80.5\pm0.1$ \\
\gtxt{\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{cos} & - & $80.7\pm0.1$ \\
\gtxt{\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & all & \gtxt{0.8} & \gtxt{$\sigma_\text{max} = 4.0$} & \gtxt{cos} & - & $81.3\pm0.1$ \\
\multicolumn{2}{l}{\textbf{TinyImageNet}} & $66.1 \pm 0.5$ & $68.3 \pm 0.7$ \\
specific & LaMa \cite{Suvorov2021} & $65.5 \pm 0.4$ & $71.2 \pm 0.5$ \\
general & \gtxt{LaMa \cite{Suvorov2021}} & $66.4 \pm 0.6$ & $72.9 \pm 0.6$ \\
\gtxt{general} & Att. Eraser \cite{Sun2024} & $67.5 \pm 1.2$ & $72.4 \pm 0.5$ \\
\bottomrule
\end{tabular}}
\caption{Ablation of design decisions of Tiny\name on TinyImageNet and \name on ImageNet.}
\label{tab:ablation}
\end{table*}
\end{table}
\begin{table}[t]
\caption{Ablation of the recombination phase of \schemename on TinyImageNet (top) and ImageNet (bottom). The first experiments use the initial segmentation settings with LaMa \cite{Suvorov2021}.}
\label{tab:ablation-recombine}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{ccccccccccc}
\toprule
% FG. & Augment. & BG. & BG. & Edge & Original & \multicolumn{2}{c}{Accuracy [\%]} \\
% Size & Order & Strat. & Prune & Smoothing & Mixing & ViT-Ti & ViT-S \\
\multirow{2.5}{*}{\makecell{FG. \\size}} & \multirow{2.5}{*}{\makecell{Augment.\\Order}} & \multirow{2.5}{*}{\makecell{BG\\Strat.}} & \multirow{2.5}{*}{\makecell{BG.\\Prune}} & \multirow{2.5}{*}{\makecell{Original\\Mixing}} & \multirow{2.5}{*}{\makecell{Edge\\Smooth.}} & \multicolumn{2}{c}{Accuracy [\%]} \\
\cmidrule{7-8}
& & & & & & ViT-Ti & ViT-S \\
\midrule
% TinyImageNet & & & & & & & $66.1\pm0.5$ & $68.3\pm0.7$ \\
\multicolumn{6}{l}{\textbf{TinyImageNet}} & \gtxt{$66.1\pm0.5$} & \gtxt{$68.3\pm0.7$} \\
mean & crop$\to$paste & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\
range & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\
\midrule
% \gtxt{range} & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $66.4\pm0.6$ & $72.9\pm0.6$ \\
{range} & {crop$\to$paste} & {same} & {-} & {-} & {-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\
\gtxt{range} & paste$\to$crop & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 2.0$ & \gtxt{-} & $67.2\pm0.4$ & $72.9\pm0.5$ \\
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{-} & $65.9\pm0.5$ & $72.4\pm0.6$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.2$ & \gtxt{-} & $69.8\pm0.5$ & $75.0\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.33$ & \gtxt{-} & $69.5\pm0.4$ & $75.2\pm1.0$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.5$ & \gtxt{-} & $70.3\pm1.0$ & $74.2\pm0.2$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & linear & \gtxt{-} & $70.1\pm0.7$ & $74.9\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & reverse lin. & \gtxt{-} & $67.6\pm0.2$ & $73.2\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & cos & \gtxt{-} & $71.3\pm1.0$ & $75.7\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & $70.0\pm0.8$ & $75.5\pm0.7$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & orig. & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $67.2\pm0.9$ & $69.9\pm1.0$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $70.1\pm0.7$ & $77.5\pm0.6$ \\
\midrule
\multicolumn{6}{l}{\textbf{ImageNet}} & \gtxt{-} & \gtxt{$79.1\pm0.1$} \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & \gtxt{-} & - & $80.5\pm0.1$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & - & $80.7\pm0.1$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & - & $81.4\pm0.1$ \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Prompt.}
% We present the ablation of our main design decisions in \Cref{tab:ablation}.
First, we evaluate the type of prompt used to detect the foreground object.
Here, the \emph{general} prompt, which contains the class and the more general object category, outperforms only having the class name (\emph{specific}).
\textbf{Inpainting.} Attentive Eraser \cite{Sun2024} produces superior results compared to LaMa \cite{Suvorov2021} (see \Cref{sec:infill-model-comparison} for examples).
\textbf{Inpainting.} Among inpainting models, Attentive Eraser~\cite{Sun2024} produces slightly better results compared to LaMa~\cite{Suvorov2021} ($+0.5$ p.p. on average).
For inpainting examples, see the supplementary material.
% (see the supplementary material for examples).
% When comparing the infill models, the GAN-based LaMa \cite{Suvorov2021} gets outperformed by the Attentive Eraser \cite{Sun2024}.
\textbf{Foreground size}
This suggests that the added variability is beneficial.
\textbf{Order of data augmentation.}
% (1) Applying the image crop related augmentations \emph{before} pasting the foreground object and the color-based ones \emph{after} pasting or (2) applying all data augmentations after pasting the foreground object.
% While results are ambiguous, we choose the second strategy, as it improves the performance of ViT-S, although not the one of ViT-Ti.
Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) slightly improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}).
For ViT-Ti, the results are ambiguous.
Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}).
ViT-Ti results are ambiguous.
\textbf{Background pruning.}
When it comes to the choice of backgrounds to use, we test two pruning thresholds ($t_\text{prune}$) to exclude backgrounds with excessive inpainting.
When it comes to the backgrounds to use, we test different pruning thresholds ($t_\text{prune}$) to exclude backgrounds with large inpainting.
% and only use backgrounds with an relative size of the infilled region of at most $t_\text{prune}$ (exclusive).
A threshold of $t_\text{prune}=1.0$ means that we use all backgrounds that are not fully infilled.
% We find that the background pruning does not significantly impact the models' performance.
% We choose $t_\text{prune}=0.8$ for the following experiments to exclude backgrounds that are mostly artificial.
Varying $t_\text{prune}$ has minimal impact.
Therefore, we choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds.
Similarly, applying edge smoothing to foreground masks with Gaussian blurring actually hurts performance on Tiny\name, but slightly improves it on \name.
We choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds.
% One of the most important design decisions is the mixing of the original dataset with \name.
\textbf{Mixing} \name with the original ImageNet data proves crucial.
While constant and linear mixing schedules improve performance over no mixing by $2-3$ p.p. compared to only using Tiny\name, the cosine annealing schedule yields the best results, boosting accuracy by another $0.5-1$ p.p.
\textbf{Mixing} \schemename-augmented samples with the original ImageNet data proves crucial.
While constant and linear mixing schedules improve performance over no mixing by $2-3$ p.p. compared to only augmented samples, the cosine annealing schedule proves optimal, boosting accuracy by $3-4$ p.p.
\textbf{Edge smoothing.}
We evaluate the impact of using Gaussian blurring to smooth the edges of the foreground masks.
% Similarly, applying edge smoothing to foreground masks with Gaussian blurring actually hurts performance on Tiny\name, but slightly improves it on \name.
For larger models, this gives us a slight performance boost on the full ImageNet (second to last line in \Cref{tab:ablation-recombine}).
\textbf{Background strategy.}
Another point is the allowed choice of background image for each foreground object.
We compare using the original background, a background from the same class, and any background.
These strategies go from low diversity and high shared information content between the foreground and background to high diversity and low shared information content.
For \emph{ViT-Ti}, the latter two strategies perform comparably, while \emph{ViT-S} benefits from the added diversity of using any background.
The same is true when training on the full (ImageNet) version of \name.
The same is true when training on the full ImageNet.
\begin{figure}
\centering
\includegraphics[width=.7\columnwidth]{img/bates.pdf}
\caption{Plot of the probability distribution function (PDF) of the extended Bates distribution for different parameters $\eta$. Higher values of $\eta$ concentrate the distribution around the center.}
\label{fig:bates-pdf}
\end{figure}
\begin{table}
\caption{Accuracy of ViT-S on TinyImageNet (TIN) in percent using \schemename with different foreground position distributions by varying the Bates parameter $\eta$.
The best performance is achieved when using the uniform distribution ($\eta=1$) for training.}
\label{tbl:foreground-eta}
\centering
\resizebox{\columnwidth}{!}{
\small
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{ccccccc}
\toprule
\multirow{2.5}{*}{\makecell{Training Set/ \\ Bates Parameter}} & \multirow{2.5}{*}{TIN} & \multicolumn{5}{c}{Tiny\name} \\
\multirow{2.5}{*}{\makecell{Bates Parameter \\during training}} & \multirow{2.5}{*}{\makecell{TIN \\w/o \schemename}} & \multicolumn{5}{c}{TIN w/ \schemename} \\
\cmidrule(l){3-7}
& & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\
& & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\
\midrule
TinyImageNet & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 \\
$\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\
$\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\
$\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\
$\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\
$\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\
Baseline & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 \\
$\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\
$\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\
$\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\
$\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\
$\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\
\bottomrule
\end{tabular}}
\caption{Accuracy of ViT-S trained on TinyImageNet (TIN) and Tiny\name with different foreground position distributions by varying the parameter of a Bates distribution $\eta$.
The best performance is achieved using the uniform distribution ($\eta=1$).}
\end{table}
\textbf{Foreground position.}
Finally, we analyze the foreground object's positioning in the image.
We utilize an extended Bates distribution to sample the position of the foreground object.
The Bates distribution~\cite{Bates1955} with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
Therefore, the larger $\eta$, the more concentrated the distribution is around the center.
We extend this concept to $\eta \leq -1$ by defining ${X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta)}$ for $\eta \leq -1$ with $s$ being the sawtooth function on $[0, 1]$:
\begin{align}
s(x) = \begin{cases}
x + 0.5 & \text{if } 0 \leq x < 0.5 \\
x - 0.5 & \text{if } 0.5 \leq x \leq 1
\end{cases}
\end{align}
Note that $s \circ s = \id$ on $[0, 1]$.
This way, distributions with $\eta \leq -1$ are more concentrated around the borders.
$\eta = 1$ and $\eta = -1$ both correspond to the uniform distribution.
The PDF of this extended Bates distribution is visualized in \Cref{fig:bates-pdf}.
When sampling more towards the center of the image, the difficulty of the task is reduced, which then reduces the performance on TinyImageNet.
This is reflected in the performance when evaluating on Tiny\name with $\eta=2$ and $\eta=3$ compared to $\eta=-1/1$.
Finally, we analyze the foreground object's positioning in the image, using a
generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \Z$.
The Bates distribution presents an easy way to sample from a bounded domain with just one hyperparameter that controls its concentration.
$\eta = 1/-1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders (see supplementary material for details).
% We utilize an extended Bates distribution to sample the position of the foreground object.
% The Bates distribution with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
% The larger $\eta$, the more concentrated the distribution is at the center, $\eta < -1$ concentrates the distribution at the edges.
% We extend this concept to $\eta \leq -1$, shifting the distribution away from the center and towards the edges.
When sampling more towards the center of the image, the difficulty of the task is reduced, which reduces performance on TinyImageNet (\Cref{tbl:foreground-eta}).
This is reflected in the performance when evaluating using \schemename with $\eta=2$ and $\eta=3$ compared to $\eta=-1/1$.
We observe a similar reduction for $\eta < -1$.
This experiment is conducted using the LaMa infill model.
% This experiment is conducted using the LaMa infill model.
\begin{table}
\centering
\small
\begin{tabular}{lccc}
\toprule
Dataset & Classes & \makecell{Training \\ Images} & \makecell{Validation \\ Images} \\
\midrule
TinyImageNet & 200 & 100,000 & 10,000 \\
Tiny\name & 200 & 99,404 & 9,915 \\
ImageNet & 1,000 & 1,281,167 & 50,000 \\
\name & 1,000 & 1,274,557 & 49,751 \\
\bottomrule
\end{tabular}
\caption{Dataset statistics for TinyImageNet, Tiny\name, ImageNet, and \name. For \name and Tiny\name we report the number of foreground/background pairs.}
\caption{Dataset statistics for TinyImageNet and ImageNet with and without \schemename. For \schemename we report the number of foreground/background pairs.}
\label{tab:dataset-stats}
\centering
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{l S[table-format=4.0] S[table-format=7.0] S[table-format=5.0]}
\toprule
Dataset & {Classes} & {\makecell{Training \\ Images}} & {\makecell{Validation \\ Images}} \\
\midrule
TinyImageNet & 200 & 100000 & 10000 \\
TinyImageNet + \schemename & 200 & 99404 & 9915 \\
ImageNet & 1000 & 1281167 & 50000 \\
ImageNet + \schemename & 1000 & 1274557 & 49751 \\
\bottomrule
\end{tabular}}
\end{table}
After fixing the optimal design parameters in \Cref{tab:ablation} (last row), we construct the full \name dataset using the entire ImageNet dataset.
\Cref{tab:dataset-stats} compares the dataset statistics of ImageNet and \name.
After fixing the optimal design parameters in \Cref{tab:ablation-segment,tab:ablation-recombine} (last rows), we run \schemename's segmentation step on the entire ImageNet dataset.
\Cref{tab:dataset-stats} shows the resulting dataset statistics.
% The slightly lower number of images in \name is due to \emph{Grounded SAM} returning no or invalid detections for some images.
The slightly reduced image count in \name is due to instances where Grounded SAM failed to produce valid object detections.
The slightly reduced image count for \schemename is due to instances where Grounded SAM fails to produce valid segmentation masks.
\subsection{Image Classification Results}
\begin{table}
\centering
\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy \\ when trained on}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& ImageNet & \name & \\
\midrule
ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{+2.3} \\
ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{+3.5} \\
ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{+4.5} \\
\midrule
Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{+1.8} \\
Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{+1.2} \\
\midrule
ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{+0.5} \\
ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{+1.0} \\
\bottomrule
\end{tabular}
\caption{ImageNet results of models trained on \name and on ImageNet directly. \name improves the performance of all models in our test.}
\caption{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
\label{tab:imagenet-results}
\end{table}
\Cref{tab:imagenet-results} compares the ImageNet performance of models trained on \name and ones trained directly on ImageNet.
We adopt the training setup of \cite{Nauen2023} and \cite{Touvron2022} (details in \Cref{sec:training_setup}) for training ViT \cite{Dosovitskiy2021}, Swin \cite{Liu2021} and ResNet \cite{He2016} models.
Notably, \name improves performance across all tested architectures, including the ResNet models (up to $1$ p.p.), demonstrating benefits beyond Transformers.
For Transformer models, we observe improvements from $1.2$ p.p. to $4.5$ p.p.
This improvement is more substantial for the larger models, with ViT-L gaining $4.5$ p.p. in accuracy.
\name's improvements mostly counteract the drop in performance due to overfitting for large models.
When training on ImageNet, this drop is $3.8$ p.p. from ViT-S to ViT-L, while for \name it is reduced to $1.6$ p.p.
\begin{table}
\centering
\resizebox{\columnwidth}{!}{\begin{tabular}{lccccc}
\small
\resizebox{.8\columnwidth}{!}{\begin{tabular}{lccc}
\toprule
Model & Aircraft & Cars & Flowers & Food & Pets \\
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& w/o \schemename & w/ \schemename & \\
\midrule
ViT-S @ ImageNet & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\
ViT-S @ \name & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\
& \grntxt{+6.2} & \grntxt{+2.4} & \grntxt{+1.0} & \grntxt{+0.5} & \grntxt{+0.7} \\
\cmidrule(r){1-1}
ViT-B @ ImageNet & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\
ViT-B @ \name & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\
& \grntxt{+7.3} & \grntxt{+3.3} & \grntxt{+1.7} & \grntxt{+1.1} & \grntxt{+1.0} \\
\cmidrule(r){1-1}
ViT-L @ ImageNet & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\
ViT-L @ \name & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\
& \grntxt{+5.5} & \grntxt{+0.3} & \grntxt{+2.2} & \grntxt{+1.2} & \grntxt{+0.9} \\
ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\
ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\
ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\
\midrule
Swin-Ti @ ImageNet & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\
Swin-Ti @ \name & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\
& \grntxt{+4.1} & \grntxt{+2.5} & \grntxt{+0.3} & \grntxt{+0.4} & \grntxt{+0.6} \\
\cmidrule(r){1-1}
Swin-S @ ImageNet & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\
Swin-S @ \name & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\
& \grntxt{+5.7} & \grntxt{+2.1} & \grntxt{+1.4} & \grntxt{+0.1} & \grntxt{+0.5} \\
DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\
DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\
DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\
\midrule
ResNet-50 @ ImageNet & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\
ResNet-50 @ \name & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\
& \grntxt{+2.1} & \grntxt{+0.6} & \gtxt{$\pm$0} & \grntxt{+0.1} & \gtxt{$\pm$0} \\
\cmidrule(r){1-1}
ResNet-101 @ ImageNet & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\
ResNet-101 @ \name & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\
& \grntxt{+3.0} & \grntxt{+1.3} & \grntxt{+1.7} & \grntxt{+0.3} & \textcolor{red}{-0.3} \\
Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\
Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\
\midrule
ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\
ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
\bottomrule
\end{tabular}}
\caption{Downstream accuracy in percent when finetuning on other datasets. Models were pretrained on \name and ImageNet. Pretraining on \name increases Transformer downstream accuracy on all datasets.}
\end{table}
To assess the transferability of \name-trained models, we finetune models pretrained on ImageNet and \name on five fine-grained datasets:
FGVC-Aircraft \cite{Maji2013}, Stanford Cars~\cite{Dehghan2017}, Oxford Flowers \cite{Nilsback2008}, Food-101 \cite{Kaur2017}, and Oxford-IIIT Pets \cite{Parkhi2012}.
While for ResNets, the performance of both training datasets is about the same, for every Transformer, we see the accuracy improve on all downstream datasets by up to 7.3 p.p. and a reduction of error rate of up to $39.3\%$.
In summary, these results demonstrate that the improved representation learning achieved by training on \name translates to superior performance not only on ImageNet, but also on a variety of fine-grained image classification tasks.
\Cref{tab:imagenet-results} compares the ImageNet performance of models trained with and without \schemename.
We adopt the training setup of \cite{Nauen2025} and \cite{Touvron2022} for training ViT \cite{Dosovitskiy2021}, Swin \cite{Liu2021} and ResNet \cite{He2016} (representing CNNs) models as well as the setup of DeiT \cite{Touvron2021b} for that model.
Both setups are using strong data augmentations like RandAugment, CutMix, and Mixup optimized for Transformers (details in supplementary material).
Notably, \schemename improves performance across all tested architectures, including the ResNet models, % (up to $1$ p.p.),
demonstrating benefits beyond Transformers.
For DeiT we only observe benefits on ImageNet for the larger models.
For other transformers, we observe improvements from $1.2$ p.p. to $4.5$ p.p. with increasing gains for larger models.
% This improvement is more substantial for the larger models, with ViT-L gaining $4.5$ p.p. in accuracy.
\schemename's improvements counteract the drop in performance for increasing model sizes.
Without \schemename this drop is $3.8$ p.p. (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p.
For DeiT there is a drop of $0.8$ p.p. from small to large, while with \schemename there is a \emph{gain} of $2.4$ p.p.
\subsection{Further Model Evaluation}
% In addition to just using \name for training, its special properties and possibilities for adjustment of the data distribution make it a valuable tool for evaluating other model properties and biases.
Beyond its use for training, \name's unique properties and controlled data generation capabilities make it a powerful tool for analyzing model behavior and biases.
\paragraph*{Background Robustness}
\begin{table}
\caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-augment data augmentation on top of the copy-paste augmentation.}
\label{tab:copy-paste-comparison}
\centering
\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Background Robustness \\ when trained on}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& ImageNet & \name & \\
\midrule
ViT-S & $0.73\pm0.01$ & $0.99\pm0.01$ & \grntxt{+0.26} \\
ViT-B & $0.72\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.28} \\
ViT-L & $0.70\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.30} \\
\midrule
Swin-Ti & $0.72\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.28} \\
Swin-S & $0.72\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.28} \\
\midrule
ResNet-50 & $0.79\pm0.01$ & $0.99\pm0.01$ & \grntxt{+0.20} \\
ResNet-101 & $0.79\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.21} \\
\bottomrule
\end{tabular}
\caption{Evaluation of the background robustness of models trained on \name and on ImageNet directly. Training on \name improves the background robustness of all model to $\approx1.00$, meaning the model is indifferent to the choice of background.}
\label{tab:background-robustness}
\resizebox{\columnwidth}{!}{
\begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]}
\toprule
Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\
\midrule
% Baseline & & $79.1 \pm 0.1$ \\
Baseline + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\
+ mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\
+ fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\
+ \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\
+ infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\
+ \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\
+ edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\
+ background pruning$=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Comparison to Simple Copy-Paste.}
We compare \schemename to a simple adaption of the Copy-Paste augmentation inspired by \cite{Ge2023,Ghiasi2020,Shermaine2025} in \Cref{tab:copy-paste-comparison}.
Contrary to semantic segmentation we do not have foreground masks available.
Thus, we paste the extracted foreground objects from \emph{\schemename's segmentation stage} onto normal ImageNet images.
% Since such images do not have straight forward classification labels, we test multiple possibilities.
We observe 3 large jumps in accuracy: (\textbf{1}) From our \emph{range} foreground size variation (+11.4\%), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset (+25.7\%), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images (+12.5\%).
\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance.
\begin{table}[t]
\caption{Downstream accuracy in percent when finetuning on other datasets. Models are pretrained on ImageNet with and without \schemename. Pretraining using \schemename increases transformer downstream accuracy.
% on all datasets.
}
\label{tab:downstream-results}
\centering
\resizebox{\columnwidth}{!}{\begin{tabular}{lcccccc}
\toprule
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
\midrule
ViT-S & \xmark & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\
ViT-S & \cmark & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\
& & \grntxt{$+6.2$} & \grntxt{$+2.4$} & \grntxt{$+1.0$} & \grntxt{$+0.5$} & \grntxt{$+0.7$} \\
\cmidrule(r){1-1}
ViT-B & \xmark & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\
ViT-B & \cmark & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\
& & \grntxt{$+7.3$} & \grntxt{$+3.3$} & \grntxt{$+1.7$} & \grntxt{$+1.1$} & \grntxt{$+1.0$} \\
\cmidrule(r){1-1}
ViT-L & \xmark & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\
ViT-L & \cmark & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\
& & \grntxt{$+5.5$} & \grntxt{$+0.3$} & \grntxt{$+2.2$} & \grntxt{$+1.2$} & \grntxt{$+0.9$} \\
\midrule
DeiT-S & \xmark & $75.3\pm0.4$ & $91.1\pm0.2$ & $94.8\pm0.4$ & $89.2\pm0.2$ & $92.4\pm0.2$ \\
DeiT-S & \cmark & $76.8\pm0.8$ & $91.9\pm0.2$ & $95.2\pm0.3$ & $89.1\pm0.2$ & $92.3\pm0.4$ \\
& & \grntxt{$+1.5$} & \grntxt{$+0.8$} & \grntxt{$+0.4$} & \gtxt{$-0.1$} & \gtxt{$-0.1$} \\
\cmidrule(r){1-1}
DeiT-B & \xmark & $77.0\pm1.2$ & $92.9\pm0.2$ & $96.1\pm0.2$ & $91.2\pm0.1$ & $93.3\pm0.4$ \\
DeiT-B & \cmark & $79.3\pm0.3$ & $93.1\pm0.1$ & $96.4\pm0.2$ & $91.3\pm0.1$ & $93.3\pm0.1$ \\
& & \grntxt{$+2.3$} & \gtxt{$+0.2$} & \grntxt{$+0.3$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
\cmidrule(r){1-1}
DeiT-L & \xmark & $72.8\pm5.5$ & $92.8\pm1.0$ & $95.8\pm1.5$ & $90.5\pm2.6$ & $92.4\pm2.0$ \\
DeiT-L & \cmark & $78.8\pm0.8$ & $93.8\pm0.2$ & $97.0\pm0.2$ & $92.0\pm0.2$ & $93.5\pm0.2$ \\
& & \grntxt{$+6.0$} & \grntxt{$+1.0$} & \grntxt{$+1.2$} & \grntxt{$+1.5$} & \grntxt{$+1.1$} \\
\midrule
Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\
Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\
& & \grntxt{$+4.1$} & \grntxt{$+2.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\
\cmidrule(r){1-1}
Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\
Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\
& & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+1.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\
\midrule
ResNet-50 & \xmark & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\
ResNet-50 & \cmark & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\
& & \grntxt{$+2.1$} & \grntxt{$+0.6$} & \gtxt{$\pm0.0$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
\cmidrule(r){1-1}
ResNet-101 & \xmark & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\
ResNet-101 & \cmark & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\
& & \grntxt{$+3.0$} & \grntxt{$+1.3$} & \grntxt{$+1.7$} & \grntxt{$+0.3$} & \textcolor{red}{$-0.3$} \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Downstream tasks.} To assess the transferability of \schemename-trained models, we finetune models pretrained on ImageNet with and without \schemename on five fine-grained datasets:
FGVC-Aircraft \cite{Maji2013}, Stanford Cars~\cite{Dehghan2017}, Oxford Flowers \cite{Nilsback2008}, Food-101 \cite{Kaur2017}, and Oxford-IIIT Pets \cite{Parkhi2012}.
% While for ResNets, the performance of both training datasets is about the same,
In \Cref{tab:downstream-results} we see transformer accuracies improve on all these datasets by up to 7.3 p.p.
% and a reduction of error rate of up to $39.3\%$.
% Notably, training with \name increases the downstream performance of DeiT-S and DeiT-B, even though the ImageNet results were the same.
% This demonstrates that the improved representations from training on \name translate to superior performance beyond gains from better ImageNet performance.
Notably, training with \schemename boosts the downstream performance of DeiT-S and DeiT-B, despite similar ImageNet results.
This shows the improved representations from training with \schemename translate to gains beyond better ImageNet scores.
% not only on ImageNet, but also on fine-grained image classification tasks.
\subsection{Bias and Robustness Evaluation}
% In addition to just using \name for training, its special properties and possibilities for adjustment of the data distribution make it a valuable tool for evaluating other model properties and biases.
Beyond its use for training, \schemename's unique properties and controlled data generation capabilities make it a powerful tool for analyzing behavior and biases of black-box models.
\begin{figure*}
\centering
\includegraphics[width=.95\textwidth]{img/bg_robustness.pdf}
\caption{Evaluation of background robustness on ImageNet + \schemename, ImageNet9 and CounterAnimal.
We plot the in-distribution (top of arrow) and the out-of-distribution (bottom of arrow) accuracy when training with and without \schemename.
We annotate each arrow with its length $\Delta$.
Training with \schemename improves the background robustness of all transformers by mostly boosting the out-of-distribution accuracy.
}
\label{fig:background-robustness}
\end{figure*}
\textbf{Background Robustness.}
% By adjusting the background distribution from using a background from an image of the same class as the foreground to using any background, we can evaluate the robustness of models to shifts in the background distribution.
% We assess background robustness by changing the background distribution, comparing accuracy with backgrounds of the same class as the foreground to using any background.
We assess the robustness of models to shifts in the background distribution from a class-related background to any background.
% We define the background robustness coefficient to be the accuracy of a model on \name when using the same class background divided by the accuracy when using any background:
Background robustness is defined to be the ratio of accuracy on \name with same-class backgrounds to accuracy with any background:
\begin{align}
\text{Background Robustness} = \frac{\text{Acc}(\name_\text{all})}{\text{Acc}(\name_\text{same})}
\end{align}
It represents the relative drop in performance under a background distribution shift.
\Cref{tab:background-robustness} presents the background robustness of various models.
When trained on ImageNet, smaller models generally exhibit greater robustness to changes in the background distribution than larger models and ResNet is more robust than the tested Transformer models.
Crucially, training on \name instead of ImageNet improves the background robustness of all models to $\approx1.00$, meaning that these models are agnostic to the choice of background and only classify based on the foreground.
These findings highlight the generalization benefits of \name.
% Background robustness is defined to be the ratio of accuracy on \name with same-class backgrounds to accuracy with any background:
% \begin{align}
% \text{Background Robustness} = \frac{\text{Acc}(\name_\text{all})}{\text{Acc}(\name_\text{same})}
% \end{align}
% It represents the relative drop in performance under a background distribution shift.
\Cref{fig:background-robustness} presents the background robustness results for three datasets: ImageNet with \schemename (all backgrounds vs. backgrounds of same class), ImageNet9 \cite{Xiao2020} (random backgrounds vs. original backgrounds), and CounterAnimal \cite{Wang2024f} (counter vs. common background).
The top triangle of each arrow represents the in-distribution backgrounds and the bottom triangle represents the out-of-distribution ones.
We follow ImageNet9 and CounterAnimal and assess the background robustness in terms of the accuracy gap when evaluating a model on images of normal background distribution compared to out-of-distribution backgrounds (length of each arrow; $\Delta$).
% When trained on ImageNet, smaller models generally exhibit greater robustness to changes in the background distribution than larger models and ResNet is more robust than the tested Transformer models.
Crucially, \schemename improves the background robustness of all models and across datasets, reducing the background-gap by boosting the performance on the out-of-background-distribution samples more than the in-distribution ones.
% to $\approx1.00$, meaning that these models are agnostic to the choice of background and only classify based on the foreground.
These findings highlight the generalization benefits of \schemename to unusual image compositions.
\paragraph*{Foreground Focus}
\begin{table}
\begin{figure*}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{lcccccc}
\toprule
\multirow{4}{*}{Model} & \multicolumn{6}{c}{Foreground Focus when trained on} \\
\cmidrule(l){2-7}
& IN & FN & IN & FN & IN & FN \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(l){6-7}
& \multicolumn{2}{c}{GradCam} & \multicolumn{2}{c}{GradCam++} & \multicolumn{2}{c}{IG} \\
\midrule
ViT-S & $1.2\pm0.1$ & $2.3\pm0.3$ & $1.2\pm0.1$ & $2.1\pm0.4$ & $1.9\pm0.1$ & $2.7\pm0.1$ \\
ViT-B & $1.2\pm0.1$ & $2.4\pm0.7$ & $1.1\pm0.1$ & $2.1\pm0.1$ & $1.7\pm0.1$ & $2.7\pm0.1$ \\
ViT-L & $1.3\pm0.1$ & $1.6\pm0.1$ & $1.1\pm0.1$ & $1.3\pm0.1$ & $1.3\pm0.1$ & $2.6\pm0.1$ \\
\midrule
Swin-Ti & $0.9\pm0.1$ & $0.7\pm0.1$ & $1.0\pm0.3$ & $0.7\pm0.3$ & $2.5\pm0.1$ & $4.8\pm0.3$ \\
Swin-S & $0.8\pm0.1$ & $0.7\pm0.1$ & $0.7\pm0.1$ & $0.7\pm0.4$ & $2.4\pm0.1$ & $4.6\pm0.3$ \\
\midrule
ResNet-50 & $2.2\pm0.1$ & $2.7\pm0.1$ & $2.0\pm0.1$ & $2.9\pm0.1$ & $3.2\pm0.1$ & $4.9\pm0.2$ \\
ResNet-101 & $2.3\pm0.1$ & $2.8\pm0.1$ & $2.2\pm0.1$ & $3.0\pm0.1$ & $3.2\pm0.1$ & $4.8\pm0.1$ \\
\bottomrule
\end{tabular}}
\caption{Evaluation of the foreground focus using GradCam, GradCam++ and IntegratedGradients of models trained on \name (FN) and on ImageNet (IN) directly. Training on \name improves the foreground focus of almost all models.}
\label{tab:foreground-focus}
\end{table}
\includegraphics[width=.95\textwidth]{img/fg_focus.pdf}
\caption{Evaluation of the foreground focus (\Cref{eq:fg-focus}) using GradCam, GradCam++ and IntegratedGradients (IG) of models trained on ImageNet. Training with \schemename improves the foreground focus of almost all models.}
\label{fig:foreground-focus}
\end{figure*}
Leveraging our inherent knowledge of the foreground masks when using \name, as well as common XAI techniques~\cite{Selvaraju2016,Chattopadhay2018,Sundararajan2017}, we can evaluate a model's focus on the foreground object.
We can directly evaluate ImageNet trained models, but this technique can also be extended to other datasets without relying on manually annotated foreground-masks.
To evaluate the foreground focus, we employ Grad-CAM \cite{Selvaraju2016}, Grad-CAM++ \cite{Chattopadhay2018} or IntegratedGradients (IG) \cite{Sundararajan2017} to compute the per-pixel importance of an image for the model's prediction.
\textbf{Foreground Focus.}
Leveraging our inherent knowledge of the foreground masks when using \schemename, as well as common XAI techniques~\cite{Selvaraju2016,Chattopadhay2018,Sundararajan2017}, we can evaluate a model's focus on the foreground object.
% I.e. we measure how much the model's decision depends on the foreground.
We can directly evaluate ImageNet-trained models, but this technique can also be extended to other datasets without relying on manually annotated foreground masks.
To evaluate the foreground focus, we employ Grad-CAM \cite{Selvaraju2016}, Grad-CAM++ \cite{Chattopadhay2018} and IntegratedGradients (IG) \cite{Sundararajan2017} to compute the per-pixel importance of an image for the model's prediction.
The foreground focus is defined to be the ratio of the foreground's relative importance to its relative size in the image:
\begin{align}
\begin{align} \label{eq:fg-focus}
\text{FG Focus}(\text{img}) = \frac{\text{Area}(\text{img}) \hspace{3pt} \text{Importance}(\text{fg})}{\text{Area}(\text{fg}) \hspace{3pt} \text{Importance}(\text{img})}
\end{align}
The foreground focus of a model is its average foreground focus over all test images.
\Cref{tab:foreground-focus} presents our findings.
Training on \name significantly increases the foreground focus of ViT and ResNet across all metrics used.
For Swin, the foreground focus stagnates when measured using GradCam and GradCam++, but almost doubles when using IG.
If all pixels uniformly receive the same importance value, the foreground focus is one.
The foreground focus of a model is its average focus over all test images.
\Cref{fig:foreground-focus} presents our findings.
Using \schemename significantly increases the foreground focus of ViT, DeiT and ResNet across all XAI metrics.
% I.e. \schemename-trained models base their decision more on the foreground object compared to the background than models trained without \schemename.
% For Swin, the foreground focus stagnates when measured using GradCam and GradCam++, but almost doubles when using IG.
% We hypothesize that Swin's below-uniform foreground focus reported with GradCam is due to its specific implementation for Swin.
We hypothesize Swin's below-uniform foreground focus with GradCam is due to its specific implementation.
% These differences might be due to the way GradCam is calculated for Swin \todo{cite package website where this is from} and the \todo{common critique of GradCam}.
\paragraph*{Center Bias}
\begin{table}
\begin{table}[t]
\caption{
% Evaluation of the center bias.
Accuracy relative to the center accuracy of multiple instantiations of the models when the foreground object is in different cells of a $3 \times 3$ grid.
We calculate center bias according to \Cref{eq:center-bias}.
Using \schemename significantly reduces models' center bias.}
\label{tab:center-bias}
\centering
\resizebox{\columnwidth}{!}{
\resizebox{.78\columnwidth}{!}{
\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias when trained on}} & \multirow{2.5}{*}{Delta} \\
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& ImageNet & \name \\
& w/o \schemename & w/ \schemename \\
\midrule
ViT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNetAll_v3.pdf}} \\
& $0.255\pm0.008$ & $0.220\pm0.003$ & \grntxt{-0.035} \\
ViT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNetAll_v3.pdf}} \\
& $0.254\pm0.004$ & $0.190\pm0.002$ & \grntxt{-0.064} \\
ViT-L & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNetAll_v3.pdf}} \\
& $0.243\pm0.011$ & $0.117\pm0.007$ & \grntxt{-0.126} \\
ViT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v3.pdf}} \\
& $25.5\pm0.8$ & $22.0\pm0.3$ & \grntxt{$-3.5$} \\
ViT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v3.pdf}} \\
& $25.4\pm0.4$ & $19.0\pm0.2$ & \grntxt{$-6.4$} \\
ViT-L & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v3.pdf}} \\
& $24.3\pm1.1$ & $11.7\pm0.7$ & \grntxt{$-12.6$} \\
\midrule
Swin-Ti & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNetAll_v3.pdf}} \\
& $0.250\pm0.007$ & $0.165\pm0.002$ & \grntxt{-0.085} \\
Swin-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNetAll_v3.pdf}} \\
& $0.232\pm0.001$ & $0.156\pm0.002$ & \grntxt{-0.076} \\
DeiT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v3.pdf}} \\
& $20.4 \pm 0.2$ & $21.2 \pm 0.1$ & \gtxt{$+0.8$} \\
DeiT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v3.pdf}} \\
& $19.0 \pm 0.7$ & $19.0 \pm 0.2$ & \gtxt{$\pm0.0$} \\
DeiT-L & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v3.pdf} } & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v3.pdf} } \\
& $21.2 \pm 0.2$ & $18.0 \pm 0.2$ & \grntxt{$-3.2$} \\
\midrule
ResNet50 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNetAll_v3.pdf}} \\
& $0.263\pm0.003$ & $0.197\pm0.003$ & \grntxt{-0.066} \\
ResNet101 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNetAll_v3.pdf}} \\
& $0.230\pm0.003$ & $0.199\pm0.002$ & \grntxt{-0.031} \\
Swin-Ti & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v3.pdf}} \\
& $25.0\pm0.7$ & $16.5\pm0.2$ & \grntxt{$-8.5$} \\
Swin-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v3.pdf}} \\
& $23.2\pm0.1$ & $15.6\pm0.2$ & \grntxt{$-7.6$} \\
\midrule
ResNet50 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v3.pdf}} \\
& $26.3\pm0.3$ & $19.7\pm0.3$ & \grntxt{$-6.6$} \\
ResNet101 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v3.pdf}} \\
& $23.0\pm0.3$ & $19.9\pm0.2$ & \grntxt{$-3.1$} \\
\bottomrule
\end{tabular} }
\includegraphics[width=.75\columnwidth]{img/colorbar_horizontal.pdf}
\caption{Evaluation of the position bias. We plot the accuracy relative to the center accuracy of multiple instantiations of the models when the foreground object is in different cells of a $3 \times 3$ grid.
Training on \name significantly reduces a model's center bias.}
\label{tab:center-bias}
\includegraphics[width=.8\columnwidth]{img/colorbar_horizontal.pdf}
\end{table}
With \name we have unique control over the position of the foreground object in the image.
This lets us quantify the center bias of ImageNet- and \name-trained models.
We divide the image into a $3 \times 3$ grid and evaluate model accuracy when the foreground object is in each of the $9$ grid cells.
\textbf{Center Bias.}
With \schemename we have unique control over the position of the foreground object in the image.
This lets us quantify the center bias of models trained with and without \schemename.
We divide the image into a $3 \times 3$ grid and evaluate model accuracy when the (scaled-down) foreground object is in each of the $9$ grid cells.
Each cell's accuracy is divided by the accuracy in the center cell for normalization, which gives us the relative performance drop when the foreground is in each part of the image.
The center bias is calculated as one minus the average of the minimum performance of a corner cell and the minimum performance of a side cell:
\begin{align}
\begin{split}
& \text{Center Bias} = \\
& \hspace{7pt} 1 - \frac{\min\limits_{a, b \in \{0, 2\}} \text{Acc}(\text{cell}_{(a, b)}) + \min\limits_{\substack{a=1 \text{ or } b=1 \\ a \neq b}} \text{Acc}(\text{cell}_{(a, b)})}{2 \text{Acc}(\text{cell}_{(1, 1)})}
\end{split}
% \begin{align}
% \begin{split}
% & \text{Center Bias} = \\
% & \hspace{7pt} 1 - \frac{\min\limits_{a, b \in \{0, 2\}} \text{Acc}(\text{cell}_{(a, b)}) + \min\limits_{\substack{a=1 \text{ or } b=1 \\ a \neq b}} \text{Acc}(\text{cell}_{(a, b)})}{2 \text{Acc}(\text{cell}_{(1, 1)})}
% \end{split}
% \end{align}
\begin{align} \label{eq:center-bias}
\text{Center Bias} = 1 - \frac{\min\limits_{c \in \text{sides}} \text{Acc}(c) + \min\limits_{c \in \text{corners}} \text{Acc}(c)}{2 \text{Acc}(c_\text{center})}
\end{align}
\Cref{tab:center-bias} visualizes the center bias of three instantiations of each model.
Performance is generally highest in the center and the center top and bottom and center left and right cells, and lowest in the four corners.
Performance is generally highest in the center and lowest in the four corners.
Interestingly, ImageNet-trained models perform slightly better when the foreground object is on the right side of the image, compared to the left side, despite our use of random flipping with a probability of $0.5$ during training.
% Training on \name reduces the center bias of all models by at least half.
Training on \name significantly reduces center bias across all models.
This demonstrates that \name promotes a more uniform spatial attention distribution.
Their accuracy is higher in the center left and right cells than in the center top and bottom ones, which is not the case for ImageNet-trained models.
Using \schemename significantly reduces center bias across models, with a more uniform performance especially across the middle row.
% Their accuracy is higher in the center left and right cells than in the center top and bottom ones, which is not the case for ImageNet-trained models.
% This demonstrates that \schemename promotes a more uniform spatial attention distribution, counteracting the center-bias of ImageNet.
Thus, \schemename makes the model recognize objects across a wider spatial distribution, counteracting the center-bias of ImageNet.
\paragraph*{Size Bias}
\begin{figure}
\begin{figure}[t!]
\centering
\includegraphics[width=.9\columnwidth]{img/size_bias.pdf}
\caption{Evaluation of the size bias of models trained on \name. We plot the accuracy relative to the accuracy when using the mean foreground size.}
\includegraphics[width=\columnwidth]{img/size_bias_grid.pdf}
\caption{Evaluation of the size bias of models trained on ImageNet. We plot the accuracy relative to the accuracy when using the default size ($f_\text{size} = 1.0$).}
\label{fig:size-bias}
\end{figure}
Finally, we evaluate the impact of different-sized foreground objects on the accuracy.
\textbf{Size Bias.}
Finally, we evaluate the impact of different-sized foreground objects on the accuracy.
For this evaluation, we use the \emph{mean} foreground size strategy.
We introduce a size factor $f_\text{size}$ by which we additionally scale the foreground object before pasting it onto the background.
Results are again normalized by the accuracy when using the mean foreground size ($f_\text{size} = 1.0$).
\Cref{fig:size-bias} shows the size bias curves of ViT-S and ViT-B when trained on ImageNet and \name.
Results are normalized by the accuracy when using $f_\text{size} = 1.0$.
\Cref{fig:size-bias} shows the size bias curves of models trained with and without \schemename.
% When training on \name, the resulting model keeps it's good performance on smaller foreground objects, while models trained on ImageNet fall of faster and lower.
Models trained on \name maintain better performance even with smaller foreground objects, while ImageNet-trained models exhibit a more rapid performance decline.
Therefore, \name-training improves robustness to variations in object scale.
Models trained using \schemename perform better, especially with smaller foreground objects.
%, when ImageNet-trained models exhibit a more rapid performance decline.
Therefore, \schemename-training improves robustness to variations in object scale, especially for larger models.

View File

@@ -15,46 +15,59 @@
\begin{figure}
\centering
\includegraphics[width=\columnwidth]{img/fig-1.pdf}
\caption{Comparison of \name and ImageNet. \name recombines foreground objects with different backgrounds each epoch, thus creating a more diverse training set. We still apply traditional data augmentation afterwards.}
\caption{Comparison of traditional image classification training and training when using \schemename. \schemename recombines foreground objects with different backgrounds each epoch, thus creating a more diverse training set. We still apply strong traditional data augmentation afterwards.}
\label{fig:fig-1}
\end{figure}
Image classification, a fundamental task in computer vision (CV), involves assigning a label to an image from a predefined set of categories.
This seemingly simple task underpins a wide range of applications, including medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2022b}, and object recognition~\cite{Carion2020,He2017,Girshick2013}.
Furthermore, image classification is used for large-scale pretraining of vision models~\cite{Dosovitskiy2021,Liu2021,Touvron2021b} and to judge the progress of the field of CV \cite{Khan2022, Rangel2024}.
The advent of large-scale datasets, particularly ImageNet \cite{Deng2009}, containing millions of labeled images across thousands of categories, has been instrumental in driving significant progress in this field.
ImageNet served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016} and remains the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}.
Image classification, a fundamental task in computer vision (CV), involves assigning labels to images from a set of categories.
It underpins a wide range of applications, like medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2022b}, and object recognition~\cite{Carion2020,He2017,Girshick2013} and facilitates large-scale pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b}, and progress evaluation in CV~\cite{Khan2022, Rangel2024}.
% Furthermore, image classification is used for large-scale pretraining of vision models~\cite{Dosovitskiy2021,Liu2021,Touvron2021b} and to judge the progress of the field of CV \cite{Khan2022, Rangel2024}.
The advent of large-scale datasets, particularly ImageNet~\cite{Deng2009}, served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016} and remains the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}.
% containing millions of labeled images across thousands of categories, has been instrumental in driving significant progress in this field.
% ImageNet served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016} and remains the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}.
% It is used to train and evaluate the best models in the field.
While traditionally, convolutional neural networks (CNNs) have been the go-to architecture in CV, Transformers \cite{Vaswani2017}, particularly the Vision Transformer (ViT) \cite{Dosovitskiy2021}, have emerged as a powerful alternative, demonstrating
% These attention-based models have demonstrated
superior performance in various vision tasks, including image classification \cite{Wortsman2022,Yu2022,Carion2020,Zong2022,Wang2022a}.
While traditionally, convolutional neural networks (CNNs) have been the go-to architecture for image classification, Transformers \cite{Vaswani2017}, particularly the Vision Transformer (ViT) \cite{Dosovitskiy2021}, have emerged as a powerful alternative.
These attention-based models have demonstrated superior performance in various vision tasks, including image classification \cite{Wortsman2022,Yu2022,Carion2020,Zong2022,Wang2022a}.
Data augmentation is a key technique for training image classification models.
% A key technique for training image classification models, especially with limited data, is data augmentation.
Traditional data augmentation methods, such as random cropping, flipping, and color jittering, are commonly employed to increase the diversity of the training data and improve the model's performance~\cite{Xu2023d, Shorten2019}.
These basic transformations, originally designed for CNNs, change the input images in a way that preserves their semantic meaning~\cite{Alomar2023}.
Traditional augmentation methods, such as cropping, flipping, or color shifts, are commonly employed to increase data diversity~\cite{Xu2023d, Shorten2019}, but remain bound to existing image compositions.
While these preserve the images' semantic meaning, their ability to teach spatial invariances is limited.
% the diversity of the training data and improve the model's performance~\cite{Xu2023d, Shorten2019}.
% These basic transformations, originally designed for CNNs, change the input images in a way that preserves their semantic meaning~\cite{Alomar2023}, but are limited to existing image compositions.
While combinations of these data augmentations are still used today, they originally were proposed to benefit CNNs.
However, the architectural differences of CNNs and Transformers suggest that the latter might benefit from different data augmentation strategies.
In particular, the Transformer's self-attention mechanism is not translation equivariant~\cite{RojasGomez2023,Ding2023a}, meaning that the model does not inherently understand the spatial relationships between pixels.
In particular, the self-attention mechanism, unlike a CNN, is not translation equivariant~\cite{RojasGomez2023,Ding2023a}, meaning that the model is not designed to understand the spatial relationships between pixels.
% This creates the need for novel data augmentation strategies tailored to the Transformer architecture.
% This fact opens a new design space for data augmentation strategies to help Transformers understand the basic invariances of image classification.
% Note that these traditional data augmentations are also limited by existing image compositions.
Inspired by this inductive bias of CNNs, that is not inherent to ViTs, we propose \schemename, a novel data augmentation scheme for image classification which makes the translation equivariance of CNNs explicit in the training data by recombining foreground objects at varying positions with different backgrounds.
Recognizing that Transformers need to learn spatial relationships directly from data,
% and in general are usually trained on larger datasets~\cite{Kolesnikov2020},
we propose \schemename, a data augmentation method that makes these relationships explicit by recombining foreground objects with diverse backgrounds.
Thus, \schemename goes beyond existing image compositions and encodes desired invariances directly into the training data (see \Cref{fig:fig-1}).
% Inspired by this inductive bias of CNNs, that is not inherent to ViTs, we propose \schemename, a novel data augmentation scheme for image classification which makes the translation equivariance of CNNs explicit in the training data by recombining foreground objects at varying positions with different backgrounds.
% In this paper, we address the challenge of effectively training Transformers for image classification by proposing \schemename, a novel data augmentation scheme for image classification, which combines foreground objects with different backgrounds.
Applying \schemename to ImageNet gives rise to \name, a novel dataset that enables this data augmentation with fine-grained control over the image composition.
Recognizing that Transformers need to learn the spatial relationships from data, since they are not inherently translation invariant, and in general are usually trained on larger datasets~\cite{Kolesnikov2020}, we separate the foreground objects in ImageNet from their backgrounds, using an open-world object detector~\cite{Ren2024}, and fill in the background in a plausible way using an object removal model~\cite{Sun2024,Suvorov2021}.
This allows us to recombine any foreground object with any background on the fly, creating a highly diverse training set.
During recombination, we can control important parameters, like the size and position of the foreground object, to help the model learn the spatial invariances necessary for image classification.
We show that training on \name instead of ImageNet increases the model accuracy of Transformers by up to 4.5 p.p. on ImageNet and achieves an up to $39.3\%$ reduction in error rate on downstream tasks.
% Applying \schemename to ImageNet gives rise to \name, a novel dataset that enables this data augmentation with with fine-grained control over the image composition.
Applying \schemename to a dataset like ImageNet is a two-step process:
(1)~We separate the foreground objects in ImageNet from their backgrounds, using an open-world object detector~\cite{Ren2024} and fill in the background in a neutral way using an object removal model~\cite{Sun2024,Suvorov2021}.
(2)~This allows us to then recombine any foreground object with any background on the fly, creating a highly diverse training set.
% During recombination, we can control important parameters, like the size and position of the foreground object, to help the model learn the spatial invariances necessary for image classification.
By exploiting the control over foreground size and position during recombination, \schemename explicitly teaches spatial invariances that image classification models typically must learn implicitly.
We show that using \schemename additionally to strong traditional data augmentation increases the model accuracy of Transformers by up to 4.5 p.p. on ImageNet and reduces the error rate by up to $7.3$ p.p. in downstream tasks.
Additionally, \schemename is a useful tool for analyzing model behavior and biases, when used during the evaluation phase.
We utilize our control over the image distribution to quantify a model's background robustness (by varying the choice of background), foreground focus (by leveraging our knowledge about the placement of the foreground object), center bias (by controlling the object's position), and size bias (by controlling object size).
These analyses provide insights into model behavior and biases, which is crucial for model deployment and future robustness optimizations.
We show that training on \name, instead of ImageNet, significantly reduces all of these biases, completely removing the models' dependence on the background distribution.
We make our code for \schemename and the \name-dataset publicly available\footnote{\url{https://github.com/tobna/ForAug}} to facilitate further research.
Beyond training, \schemename becomes a diagnostic tool for analyzing model behavior and biases, when used during evaluation.
We utilize our control over the image distribution to measure a model's background robustness (by varying the choice of background), foreground focus (by leveraging our knowledge about the placement of the foreground object), center bias (by controlling position), and size bias (by controlling size).
These analyses provide valuable insights into model behavior and biases, which is crucial for model deployment and future robustness optimizations.
We show that training using \schemename significantly reduces all of these biases.
We make our code for \schemename and the output of \schemename's segmentation phase on ImageNet publicly available\footnote{Link will go here.} to facilitate further research.
\subsection*{Contributions}
\begin{itemize}
\item We propose \schemename, a novel data augmentation scheme, that recombines objects and backgrounds to train Transformers for image classification.
\item We show that training on \name, the ImageNet instantiation of \schemename, leads to 4.5 p.p. improved accuracy on ImageNet and 7.3 p.p. on downstream tasks.
\item We propose novel \schemename-based metrics to analyze and quantify fine-grained biases of trained models: Background Robustness, Foreground Focus, Center Bias, and Size Bias. Training on \name, instead of ImageNet, significantly reduces these biases.
\item We propose \schemename, a novel data augmentation scheme, that recombines objects and backgrounds. \schemename allows us to move beyond the (possibly biased) image compositions in the dataset while preserving label integrity.
\item We show that training a standard ViT using \schemename leads to up to 4.5 p.p. improved accuracy on ImageNet-1k and 7.3 p.p. on downstream tasks.
\item We propose novel \schemename-based metrics to analyze and quantify fine-grained biases of trained models: Background Robustness, Foreground Focus, Center Bias, and Size Bias. We show that \schemename significantly reduces these biases by encoding invariance that benefits ViT into the training data.
\end{itemize}

View File

@@ -1,6 +1,13 @@
% !TeX root = ../main.tex
\section{RecombiNet (Method)}
%\begin{figure*}[ht!]
% \centering
% \includegraphics[width=.9\textwidth]{img/fig-2.pdf}
% \caption{Overview of \name. The data creation consists of two stages: (1, offline) Segmentation, where we segment the foreground objects from the background and fill in the background. (2, online) Recombination, where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.}
% \label{fig:method}
%\end{figure*}
\section{\schemename (Method)}
\label{sec:method}
% \begin{itemize}
@@ -19,21 +26,21 @@
% \item Dealing with other data augmentations/transformations
% \end{itemize}
\begin{figure*}
\centering
\includegraphics[width=\textwidth]{img/fig-2.pdf}
\caption{Overview of \name. The data creation consists of two stages: (1, offline) Segmentation, where we segment the foreground objects from the background and fill in the background. (2, online) Recombination, where we combine the foreground objects with different backgrounds to create new samples.}
\label{fig:method}
\end{figure*}
% We propose a novel dataset, called \name, that improves image classification performance by explicitly separating and recombining foreground objects and plain backgrounds.
% \name consists of two stages: Segmentation and recombination. Both are visualized in \Cref{fig:method}.
We introduce \schemename, a data augmentation scheme designed to enhance Transformer training by explicitly separating and recombining foreground objects and backgrounds.
\schemename involves two stages: Segmentation and Recombination, both visualized in \Cref{fig:method}.
% We introduce \schemename, a data augmentation scheme designed to enhance Transformer training by explicitly separating and recombining foreground objects and backgrounds.
% \schemename enhances transformer training by explicitly encoding spatial invariances that these need to learn explicitly in the data.
% \schemename involves two stages: Segmentation and Recombination, both visualized in \Cref{fig:method}.
We introduce \schemename, a data augmentation operation designed to enhance Transformer training by embedding spatial invariances---which Transformers would otherwise need to learn implicitly---directly into the training data.
% It operates by explicitly segmenting and recombining foreground objects and backgrounds.
\schemename comprises two distinct stages: Segmentation and Recombination. Both stages are illustrated in \Cref{fig:method}.
\subsubsection*{Segmentation}
\subsection{Segmentation}
\label{sec:segmentation}
The segmentation stage isolates the foreground objects and their corresponding backgrounds.
We then fill in the background in a visually plausible way~\cite{Sun2024} using a pretrained object-removal model.
% We then fill in the background in a visually plausible way~\cite{Sun2024} using a pretrained object-removal model.
We then fill the background using a pretrained object-removal model, producing visually plausible~\cite{Sun2024}, neutral scenes ready for recombination.
This stage is computed once offline and the results are stored for the recombination stage.
First, foreground objects are detected and segmented from their backgrounds using a prompt-based segmentation model to exploit the classification dataset's labels.
@@ -43,32 +50,39 @@ The \code{<object category>} guides the segmentation model towards the correct o
This can be the case with prompts like ``sorrel'' or ``guenon'', where the more general name ``horse'' or ``monkey'' is more helpful.
We derive the \code{<object category>} from the WordNet hierarchy, using the immediate hypernym.
We iteratively extract up to $n$ foreground masks for each dataset-image, using different more and more general prompts based on the more general synsets of WordNet (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...).
% We iteratively extract up to $n$ foreground masks for each dataset-image, using different more and more general prompts based on the more general synsets of WordNet (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...).
We iteratively extract $n$ foreground masks for each dataset-image, creating prompts by going one hypernym up the WordNet-tree each step (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...).
Masks that are very similar, with a pairwise IoU of at least $0.9$, are merged.
The output is a set of masks delineating the foreground objects and the backgrounds.
We select the best mask per image (according to \Cref{eq:filtering-score}) in a later filtering step, described below.
An inpainting model that is specifically optimized to remove objects from images, such as LaMa~\cite{Suvorov2021} or Attentive Eraser~\cite{Sun2024}, is used to inpaint the foreground regions in the backgrounds.
To ensure the quality of the foreground and background images (for each dataset-image), we select a foreground/background pair from the $\leq n$ variants we have extracted and infilled in the previous steps.
Using an ensemble of six ViT, ResNet, and Swin Transformer models pretrained on the original dataset, we select the foreground/background pair that maximizes foreground performance while minimizing the performance on the background and size of the foreground according to:
First, an inpainting model that is specifically optimized to remove objects from images, such as LaMa~\cite{Suvorov2021} or Attentive Eraser~\cite{Sun2024}, is used to inpaint the foreground regions in the backgrounds.
Then, to ensure the quality of the foregrounds and the neutral background images, we select a foreground/background pair (for each dataset-image) from the $\leq n$ variants we have extracted and infilled in the previous steps.
Using an ensemble $E$ of six ViT, ResNet, and Swin Transformer models pretrained on the original dataset, we select the foreground/background pair that maximizes foreground performance while minimizing both the performance on the background and the size of the foreground.
For each model $m \in E$, we predict the score of the ground truth class $c$ on the foreground $\mathrm{fg}$ and background $\mathrm{bg}$ and weigh these with the size $\operatorname{size}(\cdot)$ in number of pixels according to:
% $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels.
\begin{align} \begin{split} \label{eq:filtering-score}
\text{score}(\mathrm{fg}, \mathrm{bg}, c) &= \log \left( \frac{1}{\abs{E}} \sum_{m \in E} \P[m(\mathrm{fg}) = c] \right) \\
& + \log \left( 1 - \frac{1}{\abs E} \sum_{m \in E} \P[m(\mathrm{bg}) = c] \right) \\
& + \lambda \log \left( 1 - \abs{\frac{\operatorname{size}(\mathrm{fg})}{\operatorname{size}(\mathrm{bg})} - \eps} \right).
\end{split} \end{align}
Here, $E$ is the ensemble of models and $m$ is a pretrained model, $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels.
We ran a hyperparameter search using a manually annotated subset of foreground/background variants to find the factors in \Cref{eq:filtering-score}: $\lambda = 2$ and $\eps = 0.1$.
The \emph{optimal foreground size} of $10\%$ of the full image trades off the smallest foreground size that encompasses all class-relevant information in the image against still conveying the foreground information after pasting it onto another background.
This filtering step ensures we segment all the relevant foreground objects.
% We use $E$ is the ensemble of models and $m$ is a pretrained model, $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels.
We run a hyperparameter search using a manually annotated subset of foreground/background variants to find the factors in \Cref{eq:filtering-score}: $\lambda = 2$ and $\eps = 0.1$.
% The \textit{optimal foreground size} of $10\%$ of the full image balances the smallest possible foreground size that encompasses all the respective class information in the image with still conveying the foreground information after pasting it onto another background.
% This filtering step ensures we segment all the relevant foreground objects.
Finally, we filter out backgrounds that are more than $80\%$ infilled, as these tend to be overly synthetic and plain, and do not carry much information (see \Cref{sec:high-infill-ratio}).
We ablate this choice in \Cref{sec:ablation}.
Finally, we filter out backgrounds that are largely infilled, as these tend to be overly synthetic and do not carry much information (see the supplementary material).
% We ablate this choice in \Cref{sec:ablation}.
% While the computational cost for the segmentation stage is significant, this is a one-time calculation whose results can be reused in subsequent experiments (see the supplementary material for details).
Although the segmentation stage is computational overhead, it is a one-time cost with results that can be reused across experiments (see the supplementary material for details).
In summary, we factorize the dataset into a set of foreground objects with a transparent background and a set of diverse backgrounds per class.
The next step is to recombine them as data augmentation before applying common data augmentation operations during training.
The next step is to recombine these, before applying other common data augmentation operations during training.
\subsubsection*{Recombination}
The recombination stage, which is performed online, combines the foreground objects with different backgrounds to create new training samples.
For each object, we follow the pipeline of: Pick an appropriate background, resize it to a fitting size, place it in the background image, smooth the transition edge, and apply other data augmentations.
\subsection{Recombination}
\label{sec:recombination}
The recombination stage, performed online during training, combines the foreground objects with different backgrounds to create new training samples.
For each object, we follow the pipeline of: Pick an appropriate background, resize the foreground to a fitting size, and place it in the background image.
Through this step, we expose the model to variations beyond the image compositions of the dataset.
For each foreground object, we sample a background using one of the following strategies:
(1) the original image background, (2) the set of backgrounds from the same class, or (3) the set of all possible backgrounds.
@@ -76,26 +90,24 @@ These sets are trading off the amount of information the model can learn from th
In each epoch, each foreground object is seen exactly once, but a background may appear multiple times.
The selected foreground is resized based on its relative size within its original image and the relative size of the original foreground in the selected background image.
The final size is randomly selected from a 30\% range around upper and lower limits ($s_u$ and $s_l$), based on the original sizes:
\begin{align}
s \sim \mathcal U \left[ (1 - 0.3) s_l, (1 + 0.3) s_u \right].
\end{align}
The final size is randomly selected from a 30\% range around upper and lower limits ($s_u$ and $s_l$), based on the original sizes.
% \begin{align}
% s \sim \mathcal U \left[ (1 - 0.3) s_l, (1 + 0.3) s_u \right].
% \end{align}
To balance the size of the foreground and that of the background's original foreground, the upper and lower limits $s_u$ and $s_l$ are set to the mean or range of both sizes, depending on the foreground size strategy: \emph{mean} or \emph{range}.
The resized foreground is then placed at a random position within the background image.
This position is sampled from a generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \N$, visualized in \Cref{fig:bates-pdf}.
We choose the Bates distribution, as it presents an easy way to sample from a bounded domain with just one hyperparameter that controls the concentration of the distribution.
$\eta = 1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders.
To more seamlessly integrate the foreground, we apply a Gaussian blur with ${\sigma \in [\frac{\sigma_{\text{max}}}{10}, \sigma_{\text{max}}]}$, inspired by the standard range for the Gaussian blur operation in \cite{Touvron2022}, to the foreground's alpha-mask.
We can apply standard data augmentation techniques in two modes:
Either we apply all augmentations to the recombined image, or we apply the cropping and resizing to the background only and then apply the other augmentations after recombination.
% While for the second mode, the foreground object will always be fully visible, the first mode uses the data augmentations in the same way they would be used for the baseline dataset.
The second mode ensures the foreground object remains fully visible, while the first mode mirrors standard data augmentation practices.
% The second mode ensures the foreground object remains fully visible, while the first mode mirrors standard data augmentation practices.
The first mode mirrors standard augmentation practice, whereas the second one ensures the foreground object remains fully visible.
We experiment with a constant mixing ratio, or a linear or cosine annealing schedule that increases the amount of images from the original dataset over time.
The mixing ratio acts as a probability of selecting an image from the original dataset;
otherwise, an image with the same foreground is recombined using \schemename.
Thus, we still ensure each foreground is seen once per epoch.
otherwise, an image with the same foreground is recombined using \schemename, ensuring each object is seen once per epoch.
% Thus, we still ensure each foreground is seen once per epoch.
The recombination stage is designed to be parallelized on the CPU during training and thus does not impact training time (see supplementary material for details).

View File

@@ -6,23 +6,33 @@
\paragraph{Data Augmentation for Image Classification}
Data augmentation is a crucial technique for improving the performance and generalization of image classification models.
Traditional augmentation strategies rely on simple geometric or color-space transformations like cropping, flipping, rotation, blurring, color jittering, or random erasing \cite{Zhong2017} to increase the diversity of the training data without changing their semantic meaning.
With the advent of Transformers, new data augmentation operations like PatchDropout \cite{Liu2022d} have been proposed.
With the advent of Vision Transformers, new data augmentation operations like PatchDropout \cite{Liu2022d} have been proposed.
Other transformations like Mixup \cite{Zhang2018a}, CutMix \cite{Yun2019}, or random cropping and patching \cite{Takahashi2018} combine multiple input images.
These simple transformations are usually bundled to form more complex augmentation policies like AutoAugment \cite{Cubuk2018} and RandAugment \cite{Cubuk2019}, which automatically search for optimal augmentation policies or 3-augment \cite{Touvron2022} which is optimized to train a ViT.
For a general overview of data augmentation techniques for image classification, we refer to \cite{Shorten2019, Xu2023d}.
These simple transformations are usually bundled to form more complex augmentation policies like AutoAugment \cite{Cubuk2018} and RandAugment \cite{Cubuk2019},
% which automatically search for optimal augmentation policies
or 3-augment \cite{Touvron2022} which is optimized to train a ViT.
For a general overview of data augmentation techniques for image classification, we refer to \citet{Shorten2019, Xu2023d}.
We build upon these general augmentation techniques by introducing a novel approach to explicitly separate and recombine foregrounds and backgrounds for image classification.
Our approach is used in tandem with traditional data augmentation techniques to improve model performance and reduce biases.
We build upon these general augmentations by introducing a novel approach to explicitly separate objects and backgrounds for image classification, allowing us to---unlike these basic transformations---move beyond dataset image compositions.
Our approach is used additionally to strong traditional techniques to improve performance and reduce biases.
\paragraph{Copy-Paste Augmentation}
The copy-paste augmentation \cite{Ghiasi2020}, which is used for object detection \cite{Shermaine2025,Ghiasi2020} and instance segmentation \cite{Werman2021,Ling2022}, involves copying segmented objects from one image and pasting them onto another.
While typically human-annotated segmentation masks are used to extract the foreground objects, other foreground sources have been explored, like 3D models \cite{Hinterstoisser2019} and pretrained object-detection models for use on objects on white background \cite{Dwibedi2017} or synthetic images \cite{Ge2023}.
DeePaste \cite{Werman2021} focuses on using inpainting for a more seamless integration of the pasted object.
The copy-paste augmentation \cite{Ghiasi2020}, which is used only for object detection \cite{Shermaine2025,Ghiasi2020} and instance segmentation \cite{Werman2021,Ling2022}, involves copying segmented objects from one image and pasting them onto another.
While typically human-annotated segmentation masks are used to extract the foreground objects, other foreground sources have been explored, like 3D models \cite{Hinterstoisser2019} and pretrained object-detection models for use on objects on white background \cite{Dwibedi2017} or synthetic images \cite{Ge2023}.
% DeePaste \cite{Werman2021} focuses on using inpainting for a more seamless integration of the pasted object.
\citet{Kang2022} apply copy-paste as an alternative to CutMix in image classification, but they do not shift the size or position of the foregrounds and use normal dataset images as backgrounds.
Unlike these methods, \name focuses on image classification.
While for detection and segmentation, objects are pasted onto another image (with a different foreground) or on available or rendered background images of the target scene, we extract foreground objects and fill in the resulting holes in the background in a semantically neutral way.
This way, we can recombine any foreground object with a large variety of neutral backgrounds from natural images, enabling a controlled and diverse manipulation of image composition.
% Unlike these methods, \schemename focuses on image classification.
% While these methods paste objects onto another image (with a different foreground) or on available or rendered background images of the target scene, we extract foreground objects and fill in the resulting holes in the background in a semantically neutral way.
Unlike prior copy-paste methods that overlay objects, \schemename extracts foregrounds and replaces their backgrounds with semantically neutral fills, thereby preserving label integrity while enabling controlled and diverse recombination.
% This way, we are preserving label integrity while also having diverse, neutral backgrounds available for recombination, enabling a controlled and diverse manipulation of image composition.
\begin{figure*}[ht!]
\centering
\includegraphics[width=.9\textwidth]{img/fig-2.pdf}
\caption{Overview of \schemename. The data creation consists of two stages: Segmentation (offline, \Cref{sec:segmentation}), where we segment the foreground objects from the background and fill in the background. Recombination (online, \Cref{sec:recombination}), where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.}
\label{fig:method}
\end{figure*}
\paragraph{Model robustness evaluation}
Evaluating model robustness to various image variations is critical for understanding and improving model generalization.
@@ -30,7 +40,7 @@ Datasets like ImageNet-C \cite{Hendrycks2019} and ImageNet-P \cite{Hendrycks2019
ImageNet-E \cite{Li2023e} evaluates model robustness against a collection of distribution shifts.
Other datasets, such as ImageNet-D \cite{Zhang2024f}, focus on varying background, texture, and material, but rely on synthetic data.
Stylized ImageNet \cite{Geirhos2018} investigates the impact of texture changes.
ImageNet-9 \cite{Xiao2020} explores background variations using segmented images, but the backgrounds are often artificial.
ImageNet-9 \cite{Xiao2020} explores background variations using segmented images, but backgrounds are often artificial.
In contrast to these existing datasets, which are used only for evaluation, \name provides fine-grained control over foreground object placement, size, and background selection, enabling a precise and comprehensive analysis of specific model biases within the context of a large-scale, real-world image distribution.
As \name also provides controllable training set generation, it goes beyond simply measuring robustness to actively improving it through training.
In contrast to these existing datasets, which are used only for evaluation, \schemename provides fine-grained control over foreground object placement, size, and background selection, enabling a precise and comprehensive analysis of specific model biases within the context of a large-scale, real-world image distribution.
As \schemename also provides controllable training set generation, it goes beyond simply measuring robustness to actively improving it through training.

228
sec/reproducability.tex Normal file
View File

@@ -0,0 +1,228 @@
% !TeX root = ../main.tex
\makeatletter
% \@ifundefined{isChecklistMainFile}{
% % We are compiling a standalone document
% \newif\ifreproStandalone
% \reproStandalonetrue
% }{
% We are being \input into the main paper
\newif\ifreproStandalone
\reproStandalonefalse
% }
\makeatother
\ifreproStandalone
\documentclass[letterpaper]{article}
\usepackage[submission]{aaai2026}
\setlength{\pdfpagewidth}{8.5in}
\setlength{\pdfpageheight}{11in}
\usepackage{times}
\usepackage{helvet}
\usepackage{courier}
\usepackage{xcolor}
\frenchspacing
\begin{document}
\fi
\setlength{\leftmargini}{20pt}
\makeatletter\def\@listi{\leftmargin\leftmargini \topsep .5em \parsep .5em \itemsep .5em}
\def\@listii{\leftmargin\leftmarginii \labelwidth\leftmarginii \advance\labelwidth-\labelsep \topsep .4em \parsep .4em \itemsep .4em}
\def\@listiii{\leftmargin\leftmarginiii \labelwidth\leftmarginiii \advance\labelwidth-\labelsep \topsep .4em \parsep .4em \itemsep .4em}\makeatother
\setcounter{secnumdepth}{0}
% Use plain arabic numbers for checklist subsections and prefix enumerate
% items with the current subsection number (e.g. "2.3").
\renewcommand\thesubsection{\arabic{subsection}}
\renewcommand\labelenumi{\thesubsection.\arabic{enumi}}
% Counter for checklist sections; the item counter resets with each new section.
\newcounter{checksubsection}
\newcounter{checkitem}[checksubsection]
% Typesets a numbered checklist section heading (as a \paragraph) and
% resets the per-section item counter. Trailing % signs suppress spurious spaces.
\newcommand{\checksubsection}[1]{%
\refstepcounter{checksubsection}%
\paragraph{\arabic{checksubsection}. #1}%
\setcounter{checkitem}{0}%
}
% Emits the next checklist item with a "section.item." label (e.g. "3.2.").
\newcommand{\checkitem}{%
\refstepcounter{checkitem}%
\item[\arabic{checksubsection}.\arabic{checkitem}.]%
}
% Typesets a checklist question (#1) with its answer options (#2) in the normal
% color, then switches to blue so the author's response stands out.
\newcommand{\question}[2]{\normalcolor\checkitem #1 #2 \color{blue}}
% Places conditional follow-up instructions in the left margin (zero-width box,
% shifted 15pt left) so they do not disturb the list layout.
\newcommand{\ifyespoints}[1]{\makebox[0pt][l]{\hspace{-15pt}\normalcolor #1}}
\section*{Reproducibility Checklist}
\vspace{1em}
\hrule
\vspace{1em}
\textbf{Instructions for Authors:}
This document outlines key aspects for assessing reproducibility. Please provide your input by editing this \texttt{.tex} file directly.
For each question (that applies), replace the ``Type your response here'' text with your answer.
\vspace{1em}
\noindent
\textbf{Example:} If a question appears as
%
\begin{center}
\noindent
\begin{minipage}{.9\linewidth}
\ttfamily\raggedright
\string\question \{Proofs of all novel claims are included\} \{(yes/partial/no)\} \\
Type your response here
\end{minipage}
\end{center}
you would change it to:
\begin{center}
\noindent
\begin{minipage}{.9\linewidth}
\ttfamily\raggedright
\string\question \{Proofs of all novel claims are included\} \{(yes/partial/no)\} \\
yes
\end{minipage}
\end{center}
%
Please make sure to:
\begin{itemize}\setlength{\itemsep}{.1em}
\item Replace ONLY the ``Type your response here'' text and nothing else.
\item Use one of the options listed for that question (e.g., \textbf{yes}, \textbf{no}, \textbf{partial}, or \textbf{NA}).
\item \textbf{Not} modify any other part of the \texttt{\string\question} command or any other lines in this document.\\
\end{itemize}
You can \texttt{\string\input} this .tex file right before \texttt{\string\end\{document\}} of your main file or compile it as a stand-alone document. Check the instructions on your conference's website to see if you will be asked to provide this checklist with your paper or separately.
\vspace{1em}
\hrule
\vspace{1em}
% The questions start here
\checksubsection{General Paper Structure}
\begin{itemize}
\question{Includes a conceptual outline and/or pseudocode description of AI methods introduced}{(yes/partial/no/NA)}
yes
\question{Clearly delineates statements that are opinions, hypothesis, and speculation from objective facts and results}{(yes/no)}
yes
\question{Provides well-marked pedagogical references for less-familiar readers to gain background necessary to replicate the paper}{(yes/no)}
yes
\end{itemize}
\checksubsection{Theoretical Contributions}
\begin{itemize}
\question{Does this paper make theoretical contributions?}{(yes/no)}
no
\ifyespoints{\vspace{1.2em}If yes, please address the following points:}
\begin{itemize}
\question{All assumptions and restrictions are stated clearly and formally}{(yes/partial/no)}
Type your response here
\question{All novel claims are stated formally (e.g., in theorem statements)}{(yes/partial/no)}
Type your response here
\question{Proofs of all novel claims are included}{(yes/partial/no)}
Type your response here
\question{Proof sketches or intuitions are given for complex and/or novel results}{(yes/partial/no)}
Type your response here
\question{Appropriate citations to theoretical tools used are given}{(yes/partial/no)}
Type your response here
\question{All theoretical claims are demonstrated empirically to hold}{(yes/partial/no/NA)}
Type your response here
\question{All experimental code used to eliminate or disprove claims is included}{(yes/no/NA)}
Type your response here
\end{itemize}
\end{itemize}
\checksubsection{Dataset Usage}
\begin{itemize}
\question{Does this paper rely on one or more datasets?}{(yes/no)}
yes
\ifyespoints{If yes, please address the following points:}
\begin{itemize}
\question{A motivation is given for why the experiments are conducted on the selected datasets}{(yes/partial/no/NA)}
yes
\question{All novel datasets introduced in this paper are included in a data appendix}{(yes/partial/no/NA)}
no
\question{All novel datasets introduced in this paper will be made publicly available upon publication of the paper with a license that allows free usage for research purposes}{(yes/partial/no/NA)}
yes
\question{All datasets drawn from the existing literature (potentially including authors' own previously published work) are accompanied by appropriate citations}{(yes/no/NA)}
yes
\question{All datasets drawn from the existing literature (potentially including authors' own previously published work) are publicly available}{(yes/partial/no/NA)}
yes
\question{All datasets that are not publicly available are described in detail, with explanation why publicly available alternatives are not scientifically satisficing}{(yes/partial/no/NA)}
NA
\end{itemize}
\end{itemize}
\checksubsection{Computational Experiments}
\begin{itemize}
\question{Does this paper include computational experiments?}{(yes/no)}
yes
\ifyespoints{If yes, please address the following points:}
\begin{itemize}
\question{This paper states the number and range of values tried per (hyper-) parameter during development of the paper, along with the criterion used for selecting the final parameter setting}{(yes/partial/no/NA)}
yes
\question{Any code required for pre-processing data is included in the appendix}{(yes/partial/no)}
yes
\question{All source code required for conducting and analyzing the experiments is included in a code appendix}{(yes/partial/no)}
yes
\question{All source code required for conducting and analyzing the experiments will be made publicly available upon publication of the paper with a license that allows free usage for research purposes}{(yes/partial/no)}
yes
\question{All source code implementing new methods have comments detailing the implementation, with references to the paper where each step comes from}{(yes/partial/no)}
yes
\question{If an algorithm depends on randomness, then the method used for setting seeds is described in a way sufficient to allow replication of results}{(yes/partial/no/NA)}
yes
\question{This paper specifies the computing infrastructure used for running experiments (hardware and software), including GPU/CPU models; amount of memory; operating system; names and versions of relevant software libraries and frameworks}{(yes/partial/no)}
yes
\question{This paper formally describes evaluation metrics used and explains the motivation for choosing these metrics}{(yes/partial/no)}
yes
\question{This paper states the number of algorithm runs used to compute each reported result}{(yes/no)}
yes
\question{Analysis of experiments goes beyond single-dimensional summaries of performance (e.g., average; median) to include measures of variation, confidence, or other distributional information}{(yes/no)}
yes
\question{The significance of any improvement or decrease in performance is judged using appropriate statistical tests (e.g., Wilcoxon signed-rank)}{(yes/partial/no)}
no
\question{This paper lists all final (hyper-)parameters used for each model/algorithm in the papers experiments}{(yes/partial/no/NA)}
yes
\end{itemize}
\end{itemize}
\ifreproStandalone
\end{document}
\fi