% !TeX root = ../main.tex
\section{Introduction}
\label{sec:intro}
% \begin{figure}
% \centering
% \includegraphics[width=.5\columnwidth]{img/fig-1.pdf}
% \caption{\schemename factorizes each training image into a foreground object and a background, then recombines them on the fly while controlling background identity, object position, and object scale. Standard, strong augmentations are applied afterwards.}
% \label{fig:fig-1}
% \end{figure}
\begin{table}[t]
\caption{Examples of \schemename-generated images (center cropped) from ImageNet.
We successfully segment even multiple objects (\textit{Macaw}) and complex shapes (\textit{Cricket}).}
\label{tab:foraug-examples}
\centering
\resizebox{.9\textwidth}{!}{
\begin{tabular}{ccccc}
\toprule
Class & \makecell{Original \\Image} & \makecell{Extracted \\Foreground} & \makecell{Infilled \\Background} & Recombined Examples \\
\midrule
Macaw & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507_v1_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507_v1_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v12.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v18.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v3.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v4.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v6.JPEG} \\
% Conch & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v9.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v10.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v11.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v12.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v17.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v8.JPEG} \\
Cricket & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v0.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v10.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v16.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v2.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v6.JPEG} \\
Laptop & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v0.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v1.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v11.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v14.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v2.JPEG} \\
\bottomrule
\end{tabular}
}
\end{table}
Large-scale image classification is a central driver of modern computer vision: it benchmarks progress in computer vision~\cite{Khan2022,Rangel2024}, powers model pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b}, and yields representations that transfer broadly and underpin applications like medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2023a}, and object recognition~\cite{Carion2020,He2017,Girshick2014}.
However, classification supervision is weak in an important sense: the label does not specify \emph{how} an instance of the class should appear.
In ImageNet~\cite{Deng2009}, for example, objects often occur at characteristic positions and scales and co-occur with correlated scene context~\cite{Fatima2025,Barbu2019}.
As a result, models rely on shortcuts such as background cues, center bias, and size bias that boost in-distribution accuracy but hurt robustness and transfer~\cite{Geirhos2020,Fatima2025,Barbu2019}.

Here, data augmentation is the default defense.
Standard transformations (crop/flip/color jitter) and stronger policies such as MixUp~\cite{Zhang2018a}/CutMix~\cite{Yun2019} and automated augmentation search~\cite{Cubuk2019,Cubuk2020} expand appearance diversity, yet they largely preserve the original \emph{composition} of each image~\cite{Shorten2019,Xu2023d}.
As a result, their ability to teach spatial and compositional invariances is limited.
This constraint matters especially for Vision Transformers (ViTs)~\cite{Dosovitskiy2021}: with weaker built-in spatial inductive biases than Convolutional Neural Networks (CNNs), ViTs must learn key equivariances (e.g., translation and scale robustness) primarily from data.
Copy-paste style augmentations~\cite{Ghiasi2021,Kang2022} alter composition more aggressively by overlaying segmented objects onto other images.
These methods are typically designed for detection or instance segmentation and either rely on the dense human annotations available for those tasks or use unconstrained dataset images as backgrounds.
As a result, they offer no fine-grained control over object position and scale, and they do not explicitly enforce that the pasted background is semantically neutral, which creates ambiguous labels for classification.

To encode compositional invariances directly in the training data, we propose \emph{Foreground-Background Augmentation} (\schemename), a controlled composition augmentation that \emph{explicitly factorizes each image into foreground and background, then recombines them for label-preserving, interpretable distribution shifts}.
Concretely, \schemename uses off-the-shelf segmentation and inpainting models to (i) extract a foreground object and synthesize a class-consistent, semantically neutral background, and (ii) paste the foreground onto diverse neutral backgrounds while controlling its position and scale (see \Cref{tab:foraug-examples}).
Unlike prior copy-paste methods that simply overlay objects onto arbitrary scenes~\cite{Ghiasi2021,Kang2022}, \schemename first removes and neutralizes the original background, then samples from well-defined distributions of backgrounds, object positions, and object sizes.
This explicit factorization preserves a clean label for the recombined image while providing direct control over compositions, enabling us to break spurious correlations while still fitting seamlessly into modern strong augmentation pipelines. % (see \Cref{fig:fig-1}).
To ensure that all gains are complementary to strong augmentation pipelines (RandAugment, MixUp, CutMix), we apply \schemename on top of these widely used techniques.
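For concreteness, the recombination step can be sketched as follows; this is a minimal illustration under simplifying assumptions, not our exact implementation: the foreground is taken to be an RGBA crop produced by the segmentation model, the background a neutral inpainted image, and the sampling ranges are placeholders.
\begin{verbatim}
# Minimal sketch of ForAug-style recombination (illustrative only;
# sampling ranges are placeholders). `fg` is an RGBA foreground crop
# from a segmentation model; `bg` is a neutral inpainted background.
import random
from PIL import Image  # fg and bg are PIL Images

def recombine(fg, bg, scale_range=(0.3, 0.8), rng=random):
    """Paste a segmented foreground onto a neutral background at a
    sampled position and scale; the class label is preserved."""
    out = bg.copy()
    # Sample the object size relative to the background's short side.
    side = int(rng.uniform(*scale_range) * min(bg.size))
    f = side / max(fg.size)
    fg = fg.resize((max(1, int(fg.size[0] * f)),
                    max(1, int(fg.size[1] * f))))
    # Sample a position that keeps the object fully inside the image.
    x = rng.randint(0, bg.size[0] - fg.size[0])
    y = rng.randint(0, bg.size[1] - fg.size[1])
    out.paste(fg, (x, y), mask=fg)  # alpha channel acts as the mask
    return out
\end{verbatim}
Standard strong augmentations would then be applied to the recombined image, so \schemename composes with, rather than replaces, the existing pipeline.
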
Empirically, \schemename yields consistent accuracy gains across architectures, improving ImageNet top-1 accuracy by up to 6 percentage points (p.p.) and fine-grained downstream accuracy by up to 7.3 p.p., and even improving transfer when ImageNet accuracy is matched.
Beyond accuracy, training with \schemename substantially improves robustness on standard distribution-shift benchmarks, where we observe gains of roughly 2--19 p.p. across ViT, Swin, and ResNet architectures.

Finally, the same control knobs make \schemename a targeted diagnostic tool for shortcut reliance and model robustness.
We quantify background reliance via controlled background swaps, and probe center and size biases through systematic position and scale sweeps, showing that training with \schemename reduces model biases.
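A minimal sketch of such a position/scale sweep is shown below; it is illustrative only, and \texttt{evaluate\_at} is a hypothetical helper that recombines the evaluation set with every foreground at a given relative position and scale and returns top-1 accuracy (the grid values are placeholders).
\begin{verbatim}
# Sketch of a position/scale bias probe (illustrative only).
# `evaluate_at(model, eval_set, pos, scale)` is a hypothetical
# helper that recombines the evaluation images accordingly and
# returns top-1 accuracy.
import itertools

def probe_position_scale_bias(model, eval_set, evaluate_at):
    positions = [(0.2, 0.2), (0.5, 0.5), (0.8, 0.8)]  # relative (x, y)
    scales = [0.3, 0.5, 0.7]                           # relative size
    results = {}
    for pos, scale in itertools.product(positions, scales):
        results[(pos, scale)] = evaluate_at(model, eval_set, pos, scale)
    # Strong center or size bias appears as large accuracy drops away
    # from the centered, medium-scale configuration.
    return results
\end{verbatim}
Background reliance can be probed analogously, holding position and scale fixed while swapping the background identity.
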
\medskip
\noindent
\textbf{Contributions}
\begin{itemize}[topsep=0pt]
\item \textbf{Controlled composition augmentation for classification.}
We introduce \schemename, a foreground-background factorization and recombination scheme for image classification that creates label-preserving training samples with explicit control over background identity, object position, and object scale.
\item \textbf{Accuracy and transfer gains.}
Training with \schemename, in addition to standard strong augmentation pipelines, improves ImageNet top-1 accuracy by up to 6 p.p., boosts fine-grained downstream accuracy by up to 7.3 p.p., and increases accuracy on shifted distributions by up to 19 p.p.
\item \textbf{Controlled bias diagnostics and mitigation.}
Using the same controls during evaluation, we measure background reliance, foreground focus, and position/scale biases through targeted distribution shifts.
\schemename systematically reduces shortcut behaviors and model biases.
\end{itemize} |