% cvpr submission
%
% This commit is contained in:
% Tobias Christian Nauen
% 2026-02-24 12:01:26 +01:00
% parent 5c08f9d31a
% commit e7c0b531d6
% 59 changed files with 7238 and 4939 deletions
%
% View File
%
% @@ -1,199 +1,343 @@
% !TeX root = ../main.tex
\begin{figure}[t]
\begin{minipage}[t]{.62\textwidth}
\captionof{table}{ImageNet results when training ViTs with different data augmentation pipelines.
\schemename consistently improves performance in low- and mid-augmentation regimes and remains complementary to strong augmentation pipelines, with larger gains for larger models.
}
\label{tab:imagenet-pipelines}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccc}
\toprule
\multirow{2.5}{*}{Augmentation} & \multirow{2.5}{*}{MixUp} & \multirow{2.5}{*}{CutMix} & \multicolumn{3}{c}{Accuracy [\%] using} \\
\cmidrule(l){4-6}
& & & ViT-S & ViT-B & ViT-L \\
\midrule
Basic & \xmark & \xmark & $71.9 \pm 0.1$ & $69.5 \pm 0.2$ & $68.3 \pm 0.4$ \\
Basic + \schemename & \xmark & \xmark & $75.7 \pm 0.2$ & $75.5 \pm 0.6$ & $73.1 \pm 1.7$ \\
& & & \grntxt{$+3.8$} & \grntxt{$+6.0$} & \grntxt{$+4.8$} \\
\midrule
RandAugment & \xmark & \xmark & $76.3 \pm 0.5$ & $75.5 \pm 0.2$ & $74.7 \pm 0.4$ \\
RandAugment + \schemename & \xmark & \xmark & $78.0 \pm 0.1$ & $77.8 \pm 0.1$ & $78.0 \pm 0.6$ \\
& & & \grntxt{$+1.7$} & \grntxt{$+2.3$} & \grntxt{$+3.3$} \\
\midrule
Basic & \cmark & \cmark & $79.8 \pm 0.3$ & $78.6 \pm 0.4$ & $78.1 \pm 1.6$ \\
Basic + \schemename & \cmark & \cmark & $79.8 \pm 0.3$ & $81.6 \pm 0.5$ & $81.0 \pm 0.4$ \\
& & & \gtxt{$\pm 0.0$} & \grntxt{$+3.0$} & \grntxt{$+2.9$} \\
\midrule
3-Augment & \xmark & \cmark & $79.1 \pm 0.1$ & $77.6 \pm 0.2$ & $75.3 \pm 0.4$ \\
3-Augment + \schemename & \xmark & \cmark & $81.4 \pm 0.1$ & $81.1 \pm 0.4$ & $79.8 \pm 0.1$ \\
& & & \grntxt{$+2.3$} & \grntxt{$+3.5$} & \grntxt{$+4.5$} \\
\midrule
RandAugment & \cmark & \cmark & $80.1 \pm 0.1$ & $81.9 \pm 0.3$ & $79.3 \pm 2.3$ \\
RandAugment + \schemename & \cmark & \cmark & $80.0 \pm 0.3$ & $81.9 \pm 0.2$ & $82.4 \pm 0.1$ \\
& & & \gtxt{$-0.1$} & \gtxt{$\pm 0.0$} & \grntxt{$+3.1$} \\
\bottomrule
\end{tabular}
}
\end{minipage}
\hfill
\begin{minipage}[t]{.37\textwidth}
\captionof{table}{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
\label{tab:imagenet-results}
\resizebox{\textwidth}{!}{\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& w/o \schemename & w/ \schemename & \\
\midrule
ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\
ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\
ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\
\midrule
DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\
DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\
DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\
\midrule
Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\
Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\
\midrule
ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\
ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
\bottomrule
\end{tabular}}
\end{minipage}
\end{figure}
% \begin{table}[t]
% \caption{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
% \label{tab:imagenet-results}
% \centering
% \begin{subfigure}{.41\textwidth}
% \resizebox{\textwidth}{!}{\begin{tabular}{lccc}
% \toprule
% \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
% \cmidrule(lr){2-3}
% & w/o \schemename & w/ \schemename & \\
% \midrule
% ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\
% ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\
% ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\
% \midrule
% Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\
% Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\
% \bottomrule
% \end{tabular}}
% \end{subfigure}
% \hspace{5pt}
% \begin{subfigure}{.448\textwidth}
% \resizebox{\textwidth}{!}{\begin{tabular}{lccc}
% \toprule
% \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
% \cmidrule(lr){2-3}
% & w/o \schemename & w/ \schemename & \\
% \midrule
% DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\
% DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\
% DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\
% \midrule
% ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\
% ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
% \bottomrule
% \end{tabular}}
% \end{subfigure}
% \end{table}
\section{Experiments}
\label{sec:experiments}
% \begin{itemize}
% \item [1.] Training on RecombiNet
% \item ImageNet results (large)
% \item Ablation (TinyImageNet): Foreground position
% \item Ablation (TinyImageNet): Which background (or part of other ablation table?)
% \item Ablation (TinyImageNet+ImageNet For edge blur): Design decisions: Which infill model, pruning threshold, p$\to$t /t$\to$p, foreground rotation range (?), edge blur, original image probability/schedule, Foreground size
% \item With other Data Augmentations
% \item [2.] More evalution metrics
% \item Background accuracy (how to frame/sell? Background bias?) / Background robustness (= foreground with all background)?
% \item Foreground focus
% \item Position bias
% \item Size bias
% \end{itemize}
We conduct a comprehensive suite of experiments to validate the effectiveness of our approach,
comparing ImageNet training with and without \schemename for 10 different models and 5 data augmentation pipelines.
% We compare training on \name, the ImageNet instantiation of \schemename, to training on ImageNet for 10 different models.
% comparing ImageNet-training with and without \schemename for 10 different models.
Furthermore, we assess the impact of using \schemename for pretraining on multiple fine-grained downstream datasets.
Finally, we exploit \schemename's control over the image distribution to quantify model behaviors and biases.
We always report the mean and standard deviation of three independent training runs.
% \subsection{Image Classification Results} % NOTE(review): empty duplicate heading; the actual subsection appears later in this file.
\subsection{Design Choices of ForAug}
\label{sec:ablation}
\textbf{ImageNet training.}
\Cref{tab:imagenet-pipelines} analyzes the effect of \schemename under different data augmentation pipelines:
A \emph{basic} pipeline with RandomResizedCrop, Flip and ColorJitter, the \emph{3-Augment} pipeline from \cite{Touvron2022,Nauen2025} that also includes Grayscale, Solarization and GaussianBlur, as well as the widely used \emph{RandAugment}~\cite{Cubuk2020} based pipeline from DeiT~\cite{Touvron2021b}.
Additionally, we include MixUp~\cite{Zhang2018a} and CutMix~\cite{Yun2019} augmentations.
% We also include Mixup and CutMix.
We find that the effectiveness of \schemename depends on the interplay between model capacity and baseline augmentation strength.
When the baseline augmentation is weak or moderate, \schemename consistently improves ImageNet accuracy, with gains increasing for larger ViT models (up to $+6.0$ p.p.\ for ViT-B).
As the augmentation pipeline becomes stronger (e.g., RandAugment with MixUp and CutMix), ImageNet improvements diminish for smaller models, indicating that the baseline augmentation already saturates their capacity.
Importantly, even in cases where ImageNet accuracy does not improve, we consistently observe gains during downstream fine-tuning (see \Cref{tab:downstream-results}), suggesting that \schemename enhances representation quality beyond what is reflected by ImageNet accuracy.
We start by ablating the design choices of \schemename on TinyImageNet~\cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each. %, and Tiny\name, the application of \schemename to TinyImageNet.
% \Cref{tab:ablation} presents the results of these ablations.
\Cref{tab:ablation-segment} presents ablations for segmentation and \Cref{tab:ablation-recombine} for recombination.
\Cref{tab:imagenet-results} additionally compares performance of different model architectures.
ViT~\cite{Dosovitskiy2021}, Swin~\cite{Liu2021} and ResNet~\cite{He2016} (representing CNNs) are trained using the ``3-augment'' strategy, while DeiT~\cite{Touvron2021b} is trained using the ``RandAugment'' strategy.
Notably, \schemename improves performance across all tested architectures, including the ResNet models, % (up to $1$ p.p.),
demonstrating benefits beyond Transformers.
% We find that \schemename's improvements counteract the drop in performance for increasing model sizes.
% Without \schemename this drop is $3.8$ p.p. (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p.
% For DeiT there is a drop of $0.8$ p.p. from small to large while when using \schemename there is a \emph{gain} of $2.4$ p.p.
\begin{table}
\caption{Ablation of the design decisions in the segmentation phase of \schemename on TinyImageNet.
The first line is our baseline, while the other lines are using \schemename.
We use basic settings with the \emph{same} background strategy during recombination for this experiment.
}
\label{tab:ablation-segment}
\centering
\small
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{cccc}
\toprule
\multirow{2.5}{*}{\makecell{Detect. \\Prompt}} & \multirow{2.5}{*}{\makecell{Infill \\ Model}} & \multicolumn{2}{c}{TinyImageNet Accuracy [\%]} \\
\cmidrule{3-4}
& & ViT-Ti & ViT-S \\
\midrule
\multicolumn{2}{l}{\textbf{TinyImageNet}} & $66.1 \pm 0.5$ & $68.3 \pm 0.7$ \\
specific & LaMa \cite{Suvorov2021} & $65.5 \pm 0.4$ & $71.2 \pm 0.5$ \\
general & \gtxt{LaMa \cite{Suvorov2021}} & $66.4 \pm 0.6$ & $72.9 \pm 0.6$ \\
\gtxt{general} & Att. Eraser \cite{Sun2024} & $67.5 \pm 1.2$ & $72.4 \pm 0.5$ \\
\bottomrule
\end{tabular}}
\end{table}
\begin{table}[t]
\caption{Downstream accuracy in percent when finetuning on other datasets. Models are pretrained on ImageNet with and without \schemename. Pretraining using \schemename increases transformer downstream accuracy.
% on all datasets.
}
\label{tab:downstream-results}
\begin{subfigure}{.48\columnwidth}
\resizebox{\textwidth}{!}{\begin{tabular}{lcccccc}
\toprule
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
\midrule
ViT-S & \xmark & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\
ViT-S & \cmark & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\
& & \grntxt{$+6.2$} & \grntxt{$+2.4$} & \grntxt{$+1.0$} & \grntxt{$+0.5$} & \grntxt{$+0.7$} \\
\midrule
ViT-B & \xmark & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\
ViT-B & \cmark & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\
& & \grntxt{$+7.3$} & \grntxt{$+3.3$} & \grntxt{$+1.7$} & \grntxt{$+1.1$} & \grntxt{$+1.0$} \\
\midrule
ViT-L & \xmark & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\
ViT-L & \cmark & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\
& & \grntxt{$+5.5$} & \grntxt{$+0.3$} & \grntxt{$+2.2$} & \grntxt{$+1.2$} & \grntxt{$+0.9$} \\
\midrule
Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\
Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\
& & \grntxt{$+4.1$} & \grntxt{$+2.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\
\midrule
Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\
Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\
& & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+1.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\
\bottomrule
\end{tabular}}
\end{subfigure}
\hfill
\begin{subfigure}{.505\columnwidth}
\resizebox{\textwidth}{!}{\begin{tabular}{lcccccc}
\toprule
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
\midrule
DeiT-S & \xmark & $75.3\pm0.4$ & $91.1\pm0.2$ & $94.8\pm0.4$ & $89.2\pm0.2$ & $92.4\pm0.2$ \\
DeiT-S & \cmark & $76.8\pm0.8$ & $91.9\pm0.2$ & $95.2\pm0.3$ & $89.1\pm0.2$ & $92.3\pm0.4$ \\
& & \grntxt{$+1.5$} & \grntxt{$+0.8$} & \grntxt{$+0.4$} & \gtxt{$-0.1$} & \gtxt{$-0.1$} \\
\midrule
DeiT-B & \xmark & $77.0\pm1.2$ & $92.9\pm0.2$ & $96.1\pm0.2$ & $91.2\pm0.1$ & $93.3\pm0.4$ \\
DeiT-B & \cmark & $79.3\pm0.3$ & $93.1\pm0.1$ & $96.4\pm0.2$ & $91.3\pm0.1$ & $93.3\pm0.1$ \\
& & \grntxt{$+2.3$} & \gtxt{$+0.2$} & \grntxt{$+0.3$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
\midrule
DeiT-L & \xmark & $72.8\pm5.5$ & $92.8\pm1.0$ & $95.8\pm1.5$ & $90.5\pm2.6$ & $92.4\pm2.0$ \\
DeiT-L & \cmark & $78.8\pm0.8$ & $93.8\pm0.2$ & $97.0\pm0.2$ & $92.0\pm0.2$ & $93.5\pm0.2$ \\
& & \grntxt{$+6.0$} & \grntxt{$+1.0$} & \grntxt{$+1.2$} & \grntxt{$+1.5$} & \grntxt{$+1.1$} \\
\midrule
ResNet-50 & \xmark & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\
ResNet-50 & \cmark & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\
& & \grntxt{$+2.1$} & \grntxt{$+0.6$} & \gtxt{$\pm0.0$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
\midrule
ResNet-101 & \xmark & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\
ResNet-101 & \cmark & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\
& & \grntxt{$+3.0$} & \grntxt{$+1.3$} & \grntxt{$+1.7$} & \grntxt{$+0.3$} & \textcolor{red}{$-0.3$} \\
\bottomrule
\end{tabular}}
\end{subfigure}
\caption{Ablation of the recombination phase of \schemename on TinyImageNet (top) and ImageNet (bottom). The first experiments use the initial segmentation settings with LaMa \cite{Suvorov2021}.}
\label{tab:ablation-recombine}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{ccccccccccc}
\toprule
% FG. & Augment. & BG. & BG. & Edge & Original & \multicolumn{2}{c}{Accuracy [\%]} \\
% Size & Order & Strat. & Prune & Smoothing & Mixing & ViT-Ti & ViT-S \\
\multirow{2.5}{*}{\makecell{FG. \\size}} & \multirow{2.5}{*}{\makecell{Augment.\\Order}} & \multirow{2.5}{*}{\makecell{BG\\Strat.}} & \multirow{2.5}{*}{\makecell{BG.\\Prune}} & \multirow{2.5}{*}{\makecell{Original\\Mixing}} & \multirow{2.5}{*}{\makecell{Edge\\Smooth.}} & \multicolumn{2}{c}{Accuracy [\%]} \\
\cmidrule{7-8}
& & & & & & ViT-Ti & ViT-S \\
\midrule
% TinyImageNet & & & & & & & $66.1\pm0.5$ & $68.3\pm0.7$ \\
\multicolumn{6}{l}{\textbf{TinyImageNet}} & \gtxt{$66.1\pm0.5$} & \gtxt{$68.3\pm0.7$} \\
mean & crop$\to$paste & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\
range & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\
\midrule
% \gtxt{range} & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $66.4\pm0.6$ & $72.9\pm0.6$ \\
{range} & {crop$\to$paste} & {same} & {-} & {-} & {-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\
\gtxt{range} & paste$\to$crop & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 2.0$ & \gtxt{-} & $67.2\pm0.4$ & $72.9\pm0.5$ \\
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{-} & $65.9\pm0.5$ & $72.4\pm0.6$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.2$ & \gtxt{-} & $69.8\pm0.5$ & $75.0\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.33$ & \gtxt{-} & $69.5\pm0.4$ & $75.2\pm1.0$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.5$ & \gtxt{-} & $70.3\pm1.0$ & $74.2\pm0.2$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & linear & \gtxt{-} & $70.1\pm0.7$ & $74.9\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & reverse lin. & \gtxt{-} & $67.6\pm0.2$ & $73.2\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & cos & \gtxt{-} & $71.3\pm1.0$ & $75.7\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & $70.0\pm0.8$ & $75.5\pm0.7$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & orig. & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $67.2\pm0.9$ & $69.9\pm1.0$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $70.1\pm0.7$ & $77.5\pm0.6$ \\
\midrule
\multicolumn{6}{l}{\textbf{ImageNet}} & \gtxt{-} & \gtxt{$79.1\pm0.1$} \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & \gtxt{-} & - & $80.5\pm0.1$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & - & $80.7\pm0.1$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & - & $81.4\pm0.1$ \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Prompt.}
% We present the ablation of our main design decisions in \Cref{tab:ablation}.
First, we evaluate the type of prompt used to detect the foreground object.
Here, the \emph{general} prompt, which contains the class and the more general object category, outperforms using only the class name (\emph{specific}).
\textbf{Inpainting.} Among inpainting models, Attentive Eraser~\cite{Sun2024} produces slightly better results compared to LaMa~\cite{Suvorov2021} ($+0.5$ p.p. on average).
For inpainting examples, see the supplementary material.
% (see the supplementary material for examples).
% When comparing the infill models, the GAN-based LaMa \cite{Suvorov2021} gets outperformed by the Attentive Eraser \cite{Sun2024}.
\textbf{Foreground size}
% We observe that LaMa's often infills unnatural textures compared to Attentive Eraser.
% The size of foreground objects during training has a significant impact on the performance.
% Here, using the greater variability of the \emph{range} strategy increases the performance by $\approx 1\%$ compared to the \emph{mean} strategy.
significantly impacts performance.
Employing a \emph{range} of sizes during recombination, rather than a fixed \emph{mean} size, boosts accuracy by approximately 1 p.p.
This suggests that the added variability is beneficial.
\textbf{Order of data augmentation.}
% (1) Applying the image crop related augmentations \emph{before} pasting the foreground object and the color-based ones \emph{after} pasting or (2) applying all data augmentations after pasting the foreground object.
% While results are ambiguous, we choose the second strategy, as it improves the performance of ViT-S, although not the one of ViT-Ti.
Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}).
ViT-Ti results are ambiguous.
\textbf{Background pruning.}
For the choice of backgrounds, we test different pruning thresholds ($t_\text{prune}$) to exclude backgrounds with large inpainted regions.
% and only use backgrounds with an relative size of the infilled region of at most $t_\text{prune}$ (exclusive).
A threshold of $t_\text{prune}=1.0$ means that we use all backgrounds that are not fully infilled.
% We find that the background pruning does not significantly impact the models' performance.
% We choose $t_\text{prune}=0.8$ for the following experiments to exclude backgrounds that are mostly artificial.
Varying $t_\text{prune}$ has minimal impact.
We choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds.
% One of the most important design decisions is the mixing of the original dataset with \name.
\textbf{Mixing} \schemename-augmented samples with the original ImageNet data proves crucial.
While constant and linear mixing schedules improve performance by $2$--$3$ p.p.\ over using only augmented samples, the cosine annealing schedule proves optimal, boosting accuracy by $3$--$4$ p.p.
\textbf{Edge smoothing.}
We evaluate the impact of using Gaussian blurring to smooth the edges of the foreground masks.
% Similarly, applying edge smoothing to foreground masks with Gaussian blurring actually hurts performance on Tiny\name, but slightly improves it on \name.
For larger models, this gives us a slight performance boost on the full ImageNet (second to last line in \Cref{tab:ablation-recombine}).
\textbf{Background strategy.}
Next, we consider which background images are allowed for each foreground object.
% We evaluate three different strategies.
% (1) Picking the background from which that specific foreground was originally extracted.
% The major difference to ImageNet when using this setup is the variability in size and position of the foreground object.
% (2) Picking a background that originally had a foreground object of the same class in it.
% Here, we have backgrounds where objects of this type can typically appear while also creating a wider variety of samples due to pairing each foreground object with different backgrounds each time.
% (3) Picking any background.
% This choice has the largest variety of backgrounds, but the backgrounds are not semantically related to the foreground object anymore.
% We find in \Cref{fig:bg-strategy} that choosing only a foreground's original background is the worst choice.
We compare using the original background, a background from the same class, and any background.
These strategies go from low diversity and high shared information content between the foreground and background to high diversity and low shared information content.
For \emph{ViT-Ti}, the latter two strategies perform comparably, while \emph{ViT-S} benefits from the added diversity of using any background.
The same is true when training on the full ImageNet.
\begin{table}
\caption{Accuracy of ViT-S on TinyImageNet (TIN) in percent using \schemename with different foreground position distributions by varying the Bates parameter $\eta$.
The best performance is achieved when using the uniform distribution ($\eta=1$) for training.}
\label{tbl:foreground-eta}
\centering
\small
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{ccccccc}
\toprule
\multirow{2.5}{*}{\makecell{Bates Parameter \\during training}} & \multirow{2.5}{*}{\makecell{TIN \\w/o \schemename}} & \multicolumn{5}{c}{TIN w/ \schemename} \\
\cmidrule(l){3-7}
& & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\
\midrule
Baseline & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 \\
$\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\
$\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\
$\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\
$\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\
$\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Foreground position.}
Finally, we analyze the foreground object's positioning in the image, using a
generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \Z$.
The Bates distribution presents an easy way to sample from a bounded domain with just one hyperparameter that controls its concentration.
$\eta = 1/-1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders (see supplementary material for details).
% We utilize an extended Bates distribution to sample the position of the foreground object.
% The Bates distribution with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
% The larger $\eta$, the more concentrated the distribution is at the center, $\eta < -1$ concentrates the distribution at the edges.
% We extend this concept to $\eta \leq -1$, shifting the distribution away from the center and towards the edges.
When sampling positions closer to the center of the image, the difficulty of the task is reduced, which lowers performance on TinyImageNet (\Cref{tbl:foreground-eta}).
This is reflected in the performance when evaluating using \schemename with $\eta=2$ and $\eta=3$ compared to $\eta=1/-1$.
We observe a similar reduction for $\eta < -1$.
% This experiment is conducted using the LaMa infill model.
\begin{table}
\caption{Dataset statistics for TinyImageNet and ImageNet with and without \schemename. For \schemename we report the number of foreground/background pairs.}
\label{tab:dataset-stats}
\centering
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{l S[table-format=4.0] S[table-format=7.0] S[table-format=5.0]}
\toprule
Dataset & {Classes} & {\makecell{Training \\ Images}} & {\makecell{Validation \\ Images}} \\
\midrule
TinyImageNet & 200 & 100000 & 10000 \\
TinyImageNet + \schemename & 200 & 99404 & 9915 \\
ImageNet & 1000 & 1281167 & 50000 \\
ImageNet + \schemename & 1000 & 1274557 & 49751 \\
\bottomrule
\end{tabular}}
\end{table}
After fixing the optimal design parameters in \Cref{tab:ablation-segment,tab:ablation-recombine} (last rows), we run \schemename's segmentation step on the entire ImageNet dataset.
\Cref{tab:dataset-stats} shows the resulting dataset statistics.
% The slightly lower number of images in \name is due to \emph{Grounded SAM} returning no or invalid detections for some images.
The slightly reduced image count for \schemename is due to instances where Grounded SAM fails to produce valid segmentation masks.
\subsection{Image Classification Results}
\begin{table}
\caption{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
\label{tab:imagenet-results} % NOTE(review): this label is also defined in the figure near the top of this file -- rename or remove one to avoid a multiply-defined label.
\centering
\small
\resizebox{.8\columnwidth}{!}{\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& w/o \schemename & w/ \schemename & \\
\midrule
ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\
ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\
ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\
\midrule
DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\
DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\
DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\
\midrule
Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\
Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\
\midrule
ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\
ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
\bottomrule
\end{tabular}}
\end{table}
\Cref{tab:imagenet-results} compares the ImageNet performance of models trained with and without \schemename.
We adopt the training setup of \cite{Nauen2025} and \cite{Touvron2022} for training ViT \cite{Dosovitskiy2021}, Swin \cite{Liu2021} and ResNet \cite{He2016} (representing CNNs) models, as well as the setup of DeiT \cite{Touvron2021b} for that model.
Both setups use strong data augmentations like RandAugment, CutMix, and MixUp, optimized for Transformers (details in the supplementary material).
Notably, \schemename improves performance across all tested architectures, including the ResNet models, % (up to $1$ p.p.),
demonstrating benefits beyond Transformers.
For DeiT we only observe benefits on ImageNet for the larger models.
For other transformers, we observe improvements from $1.2$ p.p. to $4.5$ p.p. with increasing gains for larger models.
% This improvement is more substantial for the larger models, with ViT-L gaining $4.5$ p.p. in accuracy.
\schemename's improvements counteract the drop in performance for increasing model sizes.
Without \schemename this drop is $3.8$ p.p. (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p.
For DeiT, there is a drop of $0.8$ p.p.\ from small to large, while with \schemename there is a \emph{gain} of $2.4$ p.p.
\begin{table}
\caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-augment data augmentation on top of the copy-paste augmentation.}
\label{tab:copy-paste-comparison}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]}
\toprule
Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\
\midrule
% Baseline & & $79.1 \pm 0.1$ \\
Baseline + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\
+ mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\
+ fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\
+ \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\
+ infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\
+ \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\
+ edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\
+ background pruning$=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Comparison to Simple Copy-Paste.}
We compare \schemename to a simple adaptation of the Copy-Paste augmentation inspired by \cite{Ge2023,Ghiasi2020,Shermaine2025} in \Cref{tab:copy-paste-comparison}.
Contrary to semantic segmentation, we do not have foreground masks available.
Thus, we paste the extracted foreground objects from \emph{\schemename's segmentation stage} onto normal ImageNet images.
% Since such images do not have straight forward classification labels, we test multiple possibilities.
We observe 3 large jumps in accuracy: (\textbf{1}) from our \emph{range} foreground size variation ($+11.4$ p.p.), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset ($+25.7$ p.p.), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images ($+12.5$ p.p.).
\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance.
\begin{table}[t]
\caption{Downstream accuracy in percent when finetuning on other datasets. Models are pretrained on ImageNet with and without \schemename. Pretraining using \schemename increases transformer downstream accuracy.
% on all datasets.
}
\label{tab:downstream-results} % NOTE(review): this label is also used by the earlier downstream table in this file -- rename or remove one to avoid a multiply-defined label.
\centering
\resizebox{\columnwidth}{!}{\begin{tabular}{lcccccc}
\toprule
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
\midrule
ViT-S & \xmark & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\
ViT-S & \cmark & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\
& & \grntxt{$+6.2$} & \grntxt{$+2.4$} & \grntxt{$+1.0$} & \grntxt{$+0.5$} & \grntxt{$+0.7$} \\
\cmidrule(r){1-1}
ViT-B & \xmark & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\
ViT-B & \cmark & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\
& & \grntxt{$+7.3$} & \grntxt{$+3.3$} & \grntxt{$+1.7$} & \grntxt{$+1.1$} & \grntxt{$+1.0$} \\
\cmidrule(r){1-1}
ViT-L & \xmark & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\
ViT-L & \cmark & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\
& & \grntxt{$+5.5$} & \grntxt{$+0.3$} & \grntxt{$+2.2$} & \grntxt{$+1.2$} & \grntxt{$+0.9$} \\
\midrule
DeiT-S & \xmark & $75.3\pm0.4$ & $91.1\pm0.2$ & $94.8\pm0.4$ & $89.2\pm0.2$ & $92.4\pm0.2$ \\
DeiT-S & \cmark & $76.8\pm0.8$ & $91.9\pm0.2$ & $95.2\pm0.3$ & $89.1\pm0.2$ & $92.3\pm0.4$ \\
& & \grntxt{$+1.5$} & \grntxt{$+0.8$} & \grntxt{$+0.4$} & \gtxt{$-0.1$} & \gtxt{$-0.1$} \\
\cmidrule(r){1-1}
DeiT-B & \xmark & $77.0\pm1.2$ & $92.9\pm0.2$ & $96.1\pm0.2$ & $91.2\pm0.1$ & $93.3\pm0.4$ \\
DeiT-B & \cmark & $79.3\pm0.3$ & $93.1\pm0.1$ & $96.4\pm0.2$ & $91.3\pm0.1$ & $93.3\pm0.1$ \\
& & \grntxt{$+2.3$} & \gtxt{$+0.2$} & \grntxt{$+0.3$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
\cmidrule(r){1-1}
DeiT-L & \xmark & $72.8\pm5.5$ & $92.8\pm1.0$ & $95.8\pm1.5$ & $90.5\pm2.6$ & $92.4\pm2.0$ \\
DeiT-L & \cmark & $78.8\pm0.8$ & $93.8\pm0.2$ & $97.0\pm0.2$ & $92.0\pm0.2$ & $93.5\pm0.2$ \\
& & \grntxt{$+6.0$} & \grntxt{$+1.0$} & \grntxt{$+1.2$} & \grntxt{$+1.5$} & \grntxt{$+1.1$} \\
\midrule
Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\
Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\
& & \grntxt{$+4.1$} & \grntxt{$+2.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\
\cmidrule(r){1-1}
Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\
Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\
& & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+1.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\
\midrule
ResNet-50 & \xmark & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\
ResNet-50 & \cmark & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\
& & \grntxt{$+2.1$} & \grntxt{$+0.6$} & \gtxt{$\pm0.0$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
\cmidrule(r){1-1}
ResNet-101 & \xmark & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\
ResNet-101 & \cmark & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\
& & \grntxt{$+3.0$} & \grntxt{$+1.3$} & \grntxt{$+1.7$} & \grntxt{$+0.3$} & \textcolor{red}{$-0.3$} \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Downstream tasks.} To assess the transferability of \schemename-trained models, we finetune models pretrained on ImageNet with and without \schemename on five fine-grained datasets:
In \Cref{tab:downstream-results} we see transformer accuracies improve on all the downstream datasets.
% and a reduction of error rate of up to $39.3\%$.
% Notably, training with \name increases the downstream performance of DeiT-S and DeiT-B, even though the ImageNet results were the same.
% This demonstrates that the improved representations from training on \name translate to superior performance beyond gains from better ImageNet performance.
Notably, training with \schemename boosts the downstream performance of DeiT-S and DeiT-B, despite similar ImageNet results.
This shows that the improved representations from training with \schemename translate to gains beyond better ImageNet scores.
% not only on ImageNet, but also on fine-grained image classification tasks.
\begin{table}[t]
\caption{Evaluation of models trained on ImageNet with and without \schemename. \schemename generally increases models' robustness to different image distribution shifts. Note that ViT-S \emph{with} \schemename outperforms DeiT-S, the only model where \schemename does not increase robustness.}
\label{tab:robustness-datasets}
\begin{subfigure}{.485\textwidth}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccccc}
\toprule
Model & w/ \schemename & IN-Hard & IN-A & IN-C & IN-R & IN-V2 \\
\midrule
ViT-S & \xmark & $18.1 \pm 0.6$ & $18.8 \pm 0.2$ & $44.7 \pm 0.8$ & $41.6 \pm 0.6$ & $67.3 \pm 0.4$ \\
ViT-S & \cmark & $21.0 \pm 0.4$ & $26.5 \pm 0.4$ & $52.6 \pm 0.6$ & $49.8 \pm 0.3$ & $70.6 \pm 0.1$ \\
& & \grntxt{$+2.9$} & \grntxt{$+7.7$} & \grntxt{$+7.9$} & \grntxt{$+8.1$} & \grntxt{$+3.3$} \\
\midrule
ViT-B & \xmark & $17.0 \pm 0.4$ & $15.8 \pm 0.7$ & $40.4 \pm 0.8$ & $38.4 \pm 0.7$ & $65.1 \pm 0.6$ \\
ViT-B & \cmark & $22.0 \pm 0.9$ & $31.9 \pm 1.5$ & $51.6 \pm 1.8$ & $48.7 \pm 1.7$ & $70.3 \pm 0.9$ \\
& & \grntxt{$+5.0$} & \grntxt{$+16.0$} & \grntxt{$+11.2$} & \grntxt{$+10.3$} & \grntxt{$+5.2$} \\
\midrule
ViT-L & \xmark & $15.6 \pm 0.4$ & $11.3 \pm 0.9$ & $38.4 \pm 1.0$ & $36.8 \pm 0.8$ & $61.6 \pm 0.8$ \\
ViT-L & \cmark & $20.6 \pm 0.1$ & $30.4 \pm 0.5$ & $48.2 \pm 0.7$ & $46.0 \pm 0.4$ & $68.7 \pm 0.3$ \\
& & \grntxt{$+5.0$} & \grntxt{$+19.0$} & \grntxt{$+9.8$} & \grntxt{$+9.3$} & \grntxt{$+7.1$} \\
\midrule
Swin-Ti & \xmark & $16.2 \pm 0.4$ & $15.0 \pm 0.3$ & $36.0 \pm 0.8$ & $36.6 \pm 0.2$ & $65.5 \pm 0.4$ \\
Swin-Ti & \cmark & $18.3 \pm 0.3$ & $20.3 \pm 0.4$ & $41.4 \pm 0.8$ & $41.4 \pm 0.2$ & $68.2 \pm 0.4$ \\
& & \grntxt{$+2.2$} & \grntxt{$+5.4$} & \grntxt{$+5.4$} & \grntxt{$+4.8$} & \grntxt{$+2.7$} \\
\midrule
Swin-S & \xmark & $18.2 \pm 0.3$ & $19.4 \pm 0.3$ & $39.0 \pm 0.7$ & $39.1 \pm 0.2$ & $67.5 \pm 0.1$ \\
Swin-S & \cmark & $20.5 \pm 0.1$ & $27.7 \pm 0.4$ & $45.6 \pm 0.8$ & $44.1 \pm 0.3$ & $69.6 \pm 0.1$ \\
& & \grntxt{$+2.2$} & \grntxt{$+8.4$} & \grntxt{$+6.6$} & \grntxt{$+5.0$} & \grntxt{$+2.2$} \\
\bottomrule
\end{tabular}
}
\end{subfigure}
\hfill
\begin{subfigure}{.505\textwidth}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccccc}
\toprule
Model & w/ \schemename & IN-Hard & IN-A & IN-C & IN-R & IN-V2 \\
\midrule
DeiT-S & \xmark & $19.5 \pm 0.2$ & $18.4 \pm 0.3$ & $58.8 \pm 0.7$ & $43.0 \pm 0.1$ & $68.8 \pm 0.2$ \\
DeiT-S & \cmark & $18.5 \pm 0.5$ & $17.3 \pm 1.0$ & $57.0 \pm 0.9$ & $43.8 \pm 0.2$ & $68.7 \pm 0.6$ \\
& & \rdtxt{$-1.0$} & \rdtxt{$-1.1$} & \rdtxt{$-1.8$} & \grntxt{$+0.8$} & \gtxt{$-0.1$} \\
\midrule
DeiT-B & \xmark & $22.6 \pm 0.2$ & $26.0 \pm 0.2$ & $62.1 \pm 1.0$ & $45.6 \pm 1.9$ & $70.6 \pm 0.9$ \\
DeiT-B & \cmark & $22.6 \pm 0.2$ & $25.0 \pm 0.3$ & $62.8 \pm 0.6$ & $47.7 \pm 0.8$ & $70.8 \pm 0.5$ \\
& & \gtxt{$\pm 0.0$} & \rdtxt{$-1.0$} & \grntxt{$+0.8$} & \grntxt{$+2.0$} & \gtxt{$+0.2$} \\
\midrule
DeiT-L & \xmark & $21.2 \pm 2.0$ & $20.2 \pm 3.4$ & $59.3 \pm 4.3$ & $41.3 \pm 2.7$ & $66.9 \pm 2.8$ \\
DeiT-L & \cmark & $23.4 \pm 0.3$ & $28.8 \pm 2.0$ & $63.4 \pm 0.7$ & $47.8 \pm 0.6$ & $71.6 \pm 0.5$ \\
& & \grntxt{$+2.2$} & \grntxt{$+8.7$} & \grntxt{$+4.1$} & \grntxt{$+6.5$} & \grntxt{$+4.7$} \\
\midrule
ResNet50 & \xmark & $16.1 \pm 0.2$ & $9.7 \pm 0.1$ & $38.0 \pm 1.0$ & $40.5 \pm 0.6$ & $66.8 \pm 0.4$ \\
ResNet50 & \cmark & $17.2 \pm 0.1$ & $10.8 \pm 0.4$ & $41.0 \pm 0.7$ & $43.7 \pm 0.3$ & $67.5 \pm 0.1$ \\
& & \grntxt{$+1.1$} & \grntxt{$+1.1$} & \grntxt{$+3.0$} & \grntxt{$+3.2$} & \grntxt{$+0.7$} \\
\midrule
ResNet101 & \xmark & $18.2 \pm 0.4$ & $14.3 \pm 0.1$ & $41.7 \pm 0.7$ & $42.3 \pm 0.1$ & $67.7 \pm 0.5$ \\
ResNet101 & \cmark & $19.9 \pm 0.2$ & $17.6 \pm 0.5$ & $46.3 \pm 0.6$ & $46.3 \pm 0.3$ & $69.5 \pm 0.3$ \\
& & \grntxt{$+1.7$} & \grntxt{$+3.2$} & \grntxt{$+4.6$} & \grntxt{$+4.0$} & \grntxt{$+1.8$} \\
\bottomrule
\end{tabular}
}
\end{subfigure}
\end{table}
\subsection{Bias and Robustness Evaluation}
% Additional to just using \name for training, its special properties and posibilities for adjustment of the data distribution make it a valuable tool for evaluating other model properties and biases.
Beyond its use for training, \schemename's unique properties and controlled data generation capabilities make it a powerful tool for analyzing behavior and biases of black-box models.
We exploit this in two complementary ways.
First, we ask whether \schemename-trained models are more robust on \emph{external} ImageNet robustness benchmarks that are not generated by our pipeline.
Second, we use \schemename's fine-grained control for targeted evaluation of specific dimensions of model bias, such as background reliance and center/size bias.
% Together, these experiments allow us to both \emph{probe} and \emph{improve} robustness along clearly defined axes.
% This combination of standard benchmarks and controlled probes allows us to both quantify robustness improvements and attribute them to changes in particular model behaviors.
\textbf{Robustness on External Distribution Shifts.}
\Cref{tab:robustness-datasets} summarizes accuracy on five widely used ImageNet robustness benchmarks: ImageNet-Hard~\cite{Taesiri2023}, ImageNet-A~\cite{Hendrycks2021}, ImageNet-C~\cite{Hendrycks2019}, ImageNet-R~\cite{Hendrycks2021a}, and ImageNetV2~\cite{Recht2019}.
Across ViTs, Swin Transformers, and ResNets, incorporating \schemename during training generally improves robustness to all considered distribution shifts.
For ViTs, the gains are substantial: for example, ViT-B improves from $15.8\%$ to $31.9\%$ accuracy on ImageNet-A ($+16.0$ p.p.) and from $40.4\%$ to $51.6\%$ on ImageNet-C ($+11.2$ p.p.), with similar improvements for ViT-S and ViT-L.
Swin also benefits consistently, with increases of roughly $2$--$8$ p.p. on most benchmarks, and ResNet sees smaller but steady gains (e.g., up to $+4.6$ points on ImageNet-C).
For DeiT, the picture is more nuanced: DeiT-B and DeiT-L still enjoy robustness improvements, whereas DeiT-S exhibits small decreases on several benchmarks.
Interestingly, however, ViT-S trained with \schemename outperforms the DeiT-S baseline.
This suggests that controlled composition can partially close the robustness gap between lightly and heavily regularized models.
Overall, the consistent improvements on corruption-based, natural and hard examples indicate that the compositional invariances induced by \schemename extend beyond the specific foreground/background manipulations used in its construction.
\begin{figure*}[t]
\centering
\includegraphics[width=.95\textwidth]{img/bg_robustness.pdf}
\caption{Evaluation of background robustness on ImageNet + \schemename, ImageNet9~\cite{Xiao2020} and CounterAnimal~\cite{Wang2024f}.
We plot the in-distribution (top of arrow) and the out-of-distribution (bottom of arrow) accuracy when training with and without \schemename.
We annotate each arrow with its length $\Delta$.
Training with \schemename improves the background robustness of all transformers by mostly boosting the out-of-distribution accuracy.
}
\label{fig:background-robustness}
\end{figure*}
\textbf{Background Robustness.}
We assess the robustness of models to shifts in the background distribution from the one seen during training.
% \text{Background Robustness} = \frac{\text{Acc}(\name_\text{all})}{\text{Acc}(\name_\text{same})}
% \end{align}
% It represents the relative drop in performance under a background distribution shift.
\Cref{fig:background-robustness} presents the background robustness results for three datasets: ImageNet with \schemename (all backgrounds vs. backgrounds of same class), ImageNet9~\cite{Xiao2020} (random backgrounds vs. original backgrounds), and CounterAnimal~\cite{Wang2024f} (counter vs. common background).
The top triangle of each arrow represents the in-distribution backgrounds and the bottom triangle represents the out-of-distribution ones.
We follow ImageNet9 and CounterAnimal and assess the background robustness in terms of the accuracy gap when evaluating a model on images of normal background distribution compared to out-of-distribution backgrounds (length of each arrow; $\Delta$).
% When trained on ImageNet, smaller models generally exhibit greater robustness to changes in the background distribution than larger models and ResNet is more robust than the tested Transformer models.
Crucially, \schemename improves the background robustness of all models and across datasets, reducing the background-gap by boosting the performance on the out-of-background-distribution samples more than the in-distribution ones.
We find a similar trend for the Corner-Cases~\cite{Fatima2025} dataset (see supplementary).
% to $\approx1.00$, meaning that these models are agnostic to the choice of background and only classify based on the foreground.
These findings highlight the generalization benefits of \schemename to unusual image compositions.
\begin{figure*}[t]
\centering
\includegraphics[width=.95\textwidth]{img/fg_focus.pdf}
\caption{Evaluation of the foreground focus (\Cref{eq:fg-focus}) using GradCam, GradCam++ and IntegratedGradients (IG) of models trained on ImageNet. Training with \schemename improves the foreground focus of almost all models.}
\label{fig:foreground-focus}
\end{figure*}
\textbf{Foreground Focus.}
We can directly evaluate ImageNet-trained models, but this technique can also be applied to models trained on other datasets.
To evaluate the foreground focus, we employ Grad-CAM \cite{Selvaraju2016}, Grad-CAM++ \cite{Chattopadhay2018} and IntegratedGradients (IG) \cite{Sundararajan2017} to compute the per-pixel importance of an image for the model's prediction.
The foreground focus is defined to be the ratio of the foreground's relative importance to its relative size in the image:
\begin{align} \label{eq:fg-focus}
\text{FG Focus}(\text{img}) = \frac{\text{Area}(\text{img}) \cdot \text{Importance}(\text{fg})}{\text{Area}(\text{fg}) \cdot \text{Importance}(\text{img})}
\end{align}
If all pixels uniformly receive the same importance value, the foreground focus is one.
The foreground focus of a model is its average focus over all test images.
We hypothesize Swin's below-uniform foreground focus with GradCam is due to its hierarchical, window-based architecture.
% These differences might be due to the way GradCam is calculated for Swin \todo{cite package website where this is from} and the \todo{common critique of GradCam}.
\begin{table}[t]
% Removed stale duplicate of the caption, label, and two-subfigure layout (superseded by the single-table layout below); the duplicate \label{tab:center-bias} caused a multiply-defined label.
\caption{
% Evaluation of the center bias.
Accuracy relative to the center accuracy of multiple instantiations of the models when the foreground object is in different cells of a $3 \times 3$ grid.
We calculate center bias according to \Cref{eq:center-bias}.
Using \schemename significantly reduces models' center bias.}
\label{tab:center-bias}
\centering
\resizebox{.78\columnwidth}{!}{
\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& w/o \schemename & w/ \schemename \\
\midrule
ViT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v3.pdf}} \\
& $25.5\pm0.8$ & $22.0\pm0.3$ & \grntxt{$-3.5$} \\
ViT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v3.pdf}} \\
& $25.4\pm0.4$ & $19.0\pm0.2$ & \grntxt{$-6.4$} \\
ViT-L & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v3.pdf}} \\
& $24.3\pm1.1$ & $11.7\pm0.7$ & \grntxt{$-12.6$} \\
\midrule
DeiT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v3.pdf}} \\
& $20.4 \pm 0.2$ & $21.2 \pm 0.1$ & \gtxt{$+0.8$} \\
DeiT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v3.pdf}} \\
& $19.0 \pm 0.7$ & $19.0 \pm 0.2$ & \gtxt{$\pm0.0$} \\
DeiT-L & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v3.pdf} } & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v3.pdf} } \\
& $21.2 \pm 0.2$ & $18.0 \pm 0.2$ & \grntxt{$-3.2$} \\
\midrule
Swin-Ti & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v3.pdf}} \\
& $25.0\pm0.7$ & $16.5\pm0.2$ & \grntxt{$-8.5$} \\
Swin-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v3.pdf}} \\
& $23.2\pm0.1$ & $15.6\pm0.2$ & \grntxt{$-7.6$} \\
\midrule
ResNet50 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v3.pdf}} \\
& $26.3\pm0.3$ & $19.7\pm0.3$ & \grntxt{$-6.6$} \\
ResNet101 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v3.pdf}} \\
& $23.0\pm0.3$ & $19.9\pm0.2$ & \grntxt{$-3.1$} \\
\bottomrule
\end{tabular} }
\includegraphics[width=.8\columnwidth]{img/colorbar_horizontal.pdf}
\end{table}
\textbf{Center Bias.}
The center bias is calculated as one minus the average of the minimum performance over the side cells and over the corner cells, relative to the center accuracy:
% \end{split}
% \end{align}
\begin{align} \label{eq:center-bias}
\text{Center Bias} = 1 - \frac{\min\limits_{c \in \text{sides}} \text{Acc}(c) + \min\limits_{c \in \text{corners}} \text{Acc}(c)}{2 \text{Acc}(c_\text{center})}
\end{align}
\Cref{tab:center-bias} visualizes the center bias of three instantiations of each model.
Performance is generally highest in the center and lowest in the four corners.
Interestingly, ImageNet-trained models perform slightly better when the foreground object is on the right side of the image, compared to the left side, despite our use of random flipping with a probability of $0.5$ during training.
% Training on \name reduces the center bias of all models by at least half.
Using \schemename significantly reduces center bias across models, with a more uniform performance especially across the middle row.
% On corner-cases (see supplementary) we find that
% Their accuracy is higher in the center left and right cells than in the center top and bottom ones, which is not the case for ImageNet-trained models.
% This demonstrates that \schemename promotes a more uniform spatial attention distribution, counteracting the center-bias of ImageNet.
Thus, \schemename makes the model recognize objects across a wider spatial distribution, counteracting the center-bias of ImageNet.
\begin{figure}[t!]
\centering
\includegraphics[width=\columnwidth]{img/size_bias_grid.pdf}
\caption{Evaluation of the size bias of models trained on ImageNet. We plot the accuracy relative to the accuracy when using the default size ($f_\text{size} = 1.0$).}
\label{fig:size-bias}
\end{figure}
\textbf{Size Bias.}
We introduce a size factor $f_\text{size}$ by which we additionally scale the foreground object.
Results are normalized by the accuracy when using $f_\text{size} = 1.0$.
\Cref{fig:size-bias} shows the size bias curves of models trained with and without \schemename.
% When training on \name, the resulting model keeps it's good performance on smaller foreground objects, while models trained on ImageNet fall of faster and lower.
Models trained using \schemename perform better, especially with smaller foreground objects.
%, when ImageNet-trained models exhibit a more rapid performance decline.
Therefore, \schemename-training improves robustness to variations in object scale, especially for larger models.
\subsection{Design Choices of \schemename}
We next analyze key components of \schemename, focusing on three questions: how it compares to simple copy-paste, how background choice affects performance, and how reliably labels are preserved after recomposition.
Additional ablations over variants and hyperparameters are provided in the supplementary material.
\begin{table}[t]
\caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-augment data augmentation on top of the copy-paste augmentation.}
\label{tab:copy-paste-comparison}
\centering
\resizebox{.66\columnwidth}{!}{
\begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]}
\toprule
Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\
\midrule
% Baseline & & $79.1 \pm 0.1$ \\
3-Augment + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\
+ mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\
+ fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\
+ \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\
+ infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\
+ \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\
+ edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\
+ background pruning$=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Comparison to Simple Copy-Paste.}
We compare \schemename to a simple adaptation of the Copy-Paste augmentation inspired by \cite{Ge2023,Ghiasi2021,Shermaine2025} in \Cref{tab:copy-paste-comparison}.
In contrast to semantic segmentation, we do not have foreground masks available.
Thus, we paste the extracted objects from \textbf{\schemename's segmentation stage} onto normal ImageNet images.
% Since such images do not have straight forward classification labels, we test multiple possibilities.
We observe three large jumps in accuracy: (\textbf{1}) From our \emph{range} foreground size variation (+11.4\%), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset (+25.7\%), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images (+12.5\%).
\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance.
\begin{figure}[t]
\begin{minipage}[c]{.49\textwidth}
\centering
\includegraphics[width=\textwidth]{img/strategy.pdf}
\captionof{figure}{We compare Original, Same-class, and All-classes background selection using ViT-Ti and ViT-S backbones on TinyImageNet.
Increasing background diversity consistently improves classification accuracy.
}
\label{fig:background-strategy}
\end{minipage}
\hfill
\begin{minipage}[c]{.49\textwidth}
\centering
\includegraphics[width=\textwidth]{img/mask_expansion.pdf}
\captionof{figure}{
We vary the foreground mask area for TinyImageNet by shrinking or expanding masks relative to the original outline and report accuracy when training on $100\%$ augmented samples.
Performance is stable for expanded masks and degrades rapidly after shrinking masks.
}
\label{fig:mask-expansion}
\end{minipage}
\end{figure}
\textbf{Background Choice Strategy.}
\Cref{fig:background-strategy} shows the effect of background selection on TinyImageNet accuracy, where we trade off diversity against context plausibility.
% Using the original inpainted background yields the lowest accuracy, indicating limited regularization from contextual cues.
% Sampling backgrounds from the same class provides a modest but consistent improvement, suggesting that mild context variation encourages robustness while preserving semantic plausibility.
The best performance is achieved by sampling backgrounds from all classes, which introduces substantial context shifts, yet leads to the strongest accuracy gains for both ViT-Ti and ViT-S.
Thus, aggressive background diversification is more important than context plausibility and acts as an effective form of context-based regularization rather than introducing harmful noise.
\textbf{Label Integrity.}
% We assess the label integrity of \schemename, i.e., whether object labels remain correct after recombination, by verifying that the intended object is accurately extracted.
% To this end, we leverage the object bounding box annotations provided in the ImageNet validation set.
% Specifically, we compute the \emph{box precision}, defined as the fraction of the predicted mask area that lies within the ground-truth bounding box, obtaining a mean value of $91\%$.
% In addition, we measure the \emph{box-to-box IoU}, computed as the IoU between the tight bounding box enclosing the predicted mask and the tight bounding box of the ground-truth annotation, which yields a high $76.1\%$.
% Qualitative examples of the predicted masks and bounding boxes are provided in the supplementary material.
% We additionally test label integrity under systematic mask perturbations by expanding or shrinking the foreground masks before composition.
% Concretely, starting from the original outline, we erode or dilate the mask such that the foreground area changes by some percentage.
% \Cref{fig:mask-expansion} shows that accuracy is relatively stable for expanded masks, but drops off significantly for eroded masks, consistent with cropping away semantically important object parts.
% This experiment suggests, that \schemename is relatively robust to artifacts from including an object's original background in the foreground mask.
% Overall, these results indicate that the segmentation stage of \schemename reliably isolates the target class object, thereby preserving label correctness after recombination.
To quantify whether recombined images still depict the intended class, we evaluate the segmentation stage of \schemename on ImageNet validation boxes.
Our predicted masks achieve a mean box precision of $91.0\%$ (fraction of mask area inside the ground-truth bounding boxes of the ImageNet validation set) and a high box-to-box IoU of $76.1\%$, indicating that they tightly capture the target object.
Qualitative examples of the predicted masks and bounding boxes are provided in the supplementary material.
We further probe robustness to mask imprecision by eroding or dilating masks such that the foreground area changes by a fixed percentage before composition.
As shown in \Cref{fig:mask-expansion}, accuracy remains stable for expansions but drops sharply under erosion, consistent with removing semantically important object parts.
Together, these results suggest that (\textit{i}) \schemename reliably isolates the target objects and preserves label integrity and that (\textit{ii}) \schemename is robust to artifacts from an object's original background and degrades mainly when the foreground no longer contains the full object.