% !TeX root = ../main.tex
\section{Introduction}
\label{sec:intro}
% \begin{figure}
% \centering
% \includegraphics[width=.5\columnwidth]{img/fig-1.pdf}
% \caption{\schemename factorizes each training image into a foreground object and a background, then recombines them on the fly while controlling background identity, object position, and object scale. Standard, strong augmentations are applied afterwards.}
% \label{fig:fig-1}
% \end{figure}
\begin{table}[t]
\caption{Examples of \schemename-generated images (center cropped) from ImageNet.
We successfully segment even multiple objects (\textit{Macaw}) and complex shapes (\textit{Cricket}).}
\label{tab:foraug-examples}
\centering
\resizebox{.9\textwidth}{!}{
\begin{tabular}{ccccc}
\toprule
Class & \makecell{Original \\Image} & \makecell{Extracted \\Foreground} & \makecell{Infilled \\Background} & Recombined Examples \\
\midrule
Macaw & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507_v1_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507_v1_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v12.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v18.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v3.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v4.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v6.JPEG} \\
% Conch & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v9.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v10.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v11.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v12.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v17.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v8.JPEG} \\
Cricket & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v0.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v10.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v16.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v2.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v6.JPEG} \\
Laptop & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v0.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v1.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v11.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v14.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v2.JPEG} \\
\bottomrule
\end{tabular}
}
\end{table}
Large-scale image classification is a central driver of modern computer vision: it benchmarks progress in computer vision~\cite{Khan2022,Rangel2024}, powers model pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b}, and yields representations that transfer broadly and underpin applications like medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2023a}, and object recognition~\cite{Carion2020,He2017,Girshick2014}.
However, classification supervision is weak in an important sense: the label does not specify \emph{how} an instance of the class should appear.
In ImageNet~\cite{Deng2009}, for example, objects often occur at characteristic positions and scales and co-occur with correlated scene context~\cite{Fatima2025,Barbu2019}.
As a result, models rely on shortcuts such as background cues, center bias, and size bias that boost in-distribution accuracy but hurt robustness and transfer~\cite{Geirhos2020,Fatima2025,Barbu2019}.

Here, data augmentation is the default defense.
Standard transformations (crop/flip/color jitter) and stronger policies such as MixUp~\cite{Zhang2018a}/CutMix~\cite{Yun2019} and automated augmentation search~\cite{Cubuk2019,Cubuk2020} expand appearance diversity, yet they largely preserve the original \emph{composition} of each image~\cite{Shorten2019,Xu2023d}.
As a result, their ability to teach spatial and compositional invariances is limited.
This constraint matters especially for Vision Transformers (ViTs)~\cite{Dosovitskiy2021}: with weaker built-in spatial inductive biases than Convolutional Neural Networks (CNNs), ViTs must learn key equivariances (e.g., translation and scale robustness) primarily from data.
Copy-paste style augmentations~\cite{Ghiasi2021,Kang2022} alter composition more aggressively by overlaying segmented objects onto other images.
These methods are typically designed for detection or instance segmentation and either rely on the dense human annotations available for those tasks or use unconstrained dataset images as backgrounds.
As a result, they offer no fine-grained control over object position and scale, and they do not explicitly enforce that the pasted background is semantically neutral, which creates ambiguous labels for classification.

To encode compositional invariances directly in the training data, we propose \emph{Foreground-Background Augmentation} (\schemename), a controlled composition augmentation that \emph{explicitly factorizes each image into foreground and background, then recombines them for label-preserving, interpretable distribution shifts}.
Concretely, \schemename uses off-the-shelf segmentation and inpainting models to (i) extract a foreground object and synthesize a class-consistent, semantically neutral background, and (ii) paste the foreground onto diverse neutral backgrounds while controlling its position and scale (see \Cref{tab:foraug-examples}).
Unlike prior copy-paste methods that simply overlay objects onto arbitrary scenes~\cite{Ghiasi2021,Kang2022}, \schemename first removes and neutralizes the original background, then samples from well-defined distributions of backgrounds, object positions, and object sizes.
This explicit factorization preserves a clean label for the recombined image while providing direct control over compositions, enabling us to break spurious correlations while still fitting seamlessly into modern strong augmentation pipelines. % (see \Cref{fig:fig-1}).
To ensure that all gains are complementary to strong augmentation pipelines (RandAugment, MixUp, CutMix), we apply \schemename on top of these widely used techniques.
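For concreteness, the recombination step can be sketched as follows; this is a minimal illustration under simplifying assumptions, not our exact implementation: the foreground is taken to be an RGBA crop produced by the segmentation model, the background a neutral inpainted image, and the sampling ranges are placeholders.
\begin{verbatim}
# Minimal sketch of ForAug-style recombination (illustrative only;
# sampling ranges are placeholders). `fg` is an RGBA foreground crop
# from a segmentation model; `bg` is a neutral inpainted background.
import random
from PIL import Image  # fg and bg are PIL Images

def recombine(fg, bg, scale_range=(0.3, 0.8), rng=random):
    """Paste a segmented foreground onto a neutral background at a
    sampled position and scale; the class label is preserved."""
    out = bg.copy()
    # Sample the object size relative to the background's short side.
    side = int(rng.uniform(*scale_range) * min(bg.size))
    f = side / max(fg.size)
    fg = fg.resize((max(1, int(fg.size[0] * f)),
                    max(1, int(fg.size[1] * f))))
    # Sample a position that keeps the object fully inside the image.
    x = rng.randint(0, bg.size[0] - fg.size[0])
    y = rng.randint(0, bg.size[1] - fg.size[1])
    out.paste(fg, (x, y), mask=fg)  # alpha channel acts as the mask
    return out
\end{verbatim}
Standard strong augmentations would then be applied to the recombined image, so \schemename composes with, rather than replaces, the existing pipeline.
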
Empirically, \schemename yields consistent accuracy gains across architectures, improving ImageNet top-1 accuracy by up to 6 percentage points (p.p.) and fine-grained downstream accuracy by up to 7.3 p.p., and even improving transfer when ImageNet accuracy is matched.
Beyond accuracy, training with \schemename substantially improves robustness on standard distribution-shift benchmarks, where we observe gains of roughly 2--19 p.p. across ViT, Swin, and ResNet architectures.

Finally, the same control knobs make \schemename a targeted diagnostic tool for shortcut reliance and model robustness.
We quantify background reliance via controlled background swaps, and probe center and size biases through systematic position and scale sweeps, showing that training with \schemename reduces model biases.
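A minimal sketch of such a position/scale sweep is shown below; it is illustrative only, and \texttt{evaluate\_at} is a hypothetical helper that recombines the evaluation set with every foreground at a given relative position and scale and returns top-1 accuracy (the grid values are placeholders).
\begin{verbatim}
# Sketch of a position/scale bias probe (illustrative only).
# `evaluate_at(model, eval_set, pos, scale)` is a hypothetical
# helper that recombines the evaluation images accordingly and
# returns top-1 accuracy.
import itertools

def probe_position_scale_bias(model, eval_set, evaluate_at):
    positions = [(0.2, 0.2), (0.5, 0.5), (0.8, 0.8)]  # relative (x, y)
    scales = [0.3, 0.5, 0.7]                           # relative size
    results = {}
    for pos, scale in itertools.product(positions, scales):
        results[(pos, scale)] = evaluate_at(model, eval_set, pos, scale)
    # Strong center or size bias appears as large accuracy drops away
    # from the centered, medium-scale configuration.
    return results
\end{verbatim}
Background reliance can be probed analogously, holding position and scale fixed while swapping the background identity.
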
\medskip
\noindent
\textbf{Contributions}
\begin{itemize}[topsep=0pt]
\item \textbf{Controlled composition augmentation for classification.}
We introduce \schemename, a foreground-background factorization and recombination scheme for image classification that creates label-preserving training samples with explicit control over background identity, object position, and object scale.
\item \textbf{Accuracy and transfer gains.}
Training with \schemename, in addition to standard strong augmentation pipelines, improves ImageNet top-1 accuracy by up to 6 p.p., boosts fine-grained downstream accuracy by up to 7.3 p.p., and increases accuracy on shifted distributions by up to 19 p.p.
\item \textbf{Controlled bias diagnostics and mitigation.}
Using the same controls during evaluation, we measure background reliance, foreground focus, and position/scale biases through targeted distribution shifts.
\schemename systematically reduces shortcut behaviors and model biases.
\end{itemize} |