% ForAug/rebuttal.tex
% Tobias Christian Nauen 78765791be iccv 2025 submission
% 2026-02-24 12:08:38 +01:00
\documentclass[10pt,twocolumn,letterpaper]{article}
\usepackage[rebuttal]{iccv}
% Include other packages here, before hyperref.
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
% Import additional packages in the preamble file, before hyperref
\input{packages}
% If you comment hyperref and then uncomment it, you should delete
% egpaper.aux before re-running latex. (Or just hit 'q' on the first latex
% run, let it finish, and you should be clear).
\definecolor{iccvblue}{rgb}{0.21,0.49,0.74}
\usepackage[pagebackref,breaklinks,colorlinks,allcolors=iccvblue]{hyperref}
% If you wish to avoid re-using figure, table, and equation numbers from
% the main paper, please uncomment the following and change the numbers
% appropriately.
%\setcounter{figure}{2}
%\setcounter{table}{1}
%\setcounter{equation}{2}
% If you wish to avoid re-using reference numbers from the main paper,
% please uncomment the following and change the counter value to the
% number of references you have in the main paper (here, 100).
%\makeatletter
%\apptocmd{\thebibliography}{\global\c@NAT@ctr 100\relax}{}{}
%\makeatother
%%%%%%%%% PAPER ID - PLEASE UPDATE
\def\paperID{6426} % *** Enter the Paper ID here
\def\confName{ICCV}
\def\confYear{2025}
\newcommand{\rone}{\textbf{\textcolor{blue}{kCub}}}
\newcommand{\rtwo}{\textbf{\textcolor{red}{W3SS}}}
\newcommand{\rthree}{\textbf{\textcolor{ForestGreen}{5E96}}}
\begin{document}
\newcommand{\name}{\textit{ForNet}\xspace}
\newcommand{\schemename}{\textit{ForAug}\xspace}
% Names: RecombiNet, RecombNet, ReMix, ReMixNet, FoReMix/ForeMix
%%%%%%%%% TITLE - PLEASE UPDATE
\title{\schemename: Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation}
\maketitle
\thispagestyle{empty}
\appendix
We sincerely thank the reviewers (\rone, \rtwo, \rthree) for their time and valuable feedback.
Below, we address each of the reviewers' points.
% Citations are those of the original manuscript.
% \begin{table}[h!]
% \centering
% \small
% \begin{tabular}{lcccccc}
% \toprule
% \multirow{2.5}{*}{Model} & \multicolumn{3}{c}{ImageNet-9} & \multicolumn{3}{c}{CounterAnimal} \\
% \cmidrule(r){2-4} \cmidrule(l){5-7}
% & same & rand & gap & common & counter & gap \\
% \midrule
% ViT-S/16 @ IN & $85.86 \pm 1.47$ & $69.74 \pm 1.75$ & $16.12$ & $84.86 \pm 0.37$ & $69.27 \pm 0.39$ & $15.59$ \\
% ViT-S/16 @ FN & $84.34 \pm 2.17$ & $73.74 \pm 1.92$ & $10.61$ (\grntxt{-5.51}) & $88.37 \pm 0.46$ & $74.48 \pm 0.42$ & $13.89$ (\grntxt{-1.70}) \\
% ViT-B/16 @ IN & $86.24 \pm 0.67$ & $64.60 \pm 1.82$ & $21.64$ & $83.43 \pm 0.43$ & $66.56 \pm 0.66$ & $16.87$ \\
% ViT-B/16 @ FN & $84.18 \pm 3.85$ & $73.59 \pm 6.34$ & $10.59$ (\grntxt{-11.05}) & $88.21 \pm 0.61$ & $75.50 \pm 1.10$ & $12.71$ (\grntxt{-4.16}) \\
% ViT-L/16 @ IN & $88.56 \pm 0.50$ & $68.26 \pm 0.98$ & $20.30$ & $79.72 \pm 0.89$ & $60.57 \pm 1.10$ & $19.15$ \\
% ViT-L/16 @ FN & $89.72 \pm 0.37$ & $77.29 \pm 1.85$ & $12.44$ (\grntxt{-9.86}) & $87.78 \pm 0.07$ & $75.79 \pm 0.42$ & $11.99$ (\grntxt{-7.16}) \\
% \midrule
% Swin-Ti @ IN & $91.61 \pm 0.30$ & $77.85 \pm 0.52$ & $13.77$ & $84.48 \pm 0.35$ & $69.03 \pm 0.50$ & $15.44$ \\
% Swin-Ti @ FN & $93.34 \pm 0.55$ & $84.68 \pm 1.00$ & $8.66$ (\grntxt{-5.11}) & $87.40 \pm 0.15$ & $74.04 \pm 0.05$ & $13.37$ (\grntxt{-2.07}) \\
% Swin-S @ IN & $90.89 \pm 0.40$ & $74.89 \pm 0.94$ & $16.00$ & $85.93 \pm 0.43$ & $71.81 \pm 0.59$ & $14.12$ \\
% Swin-S @ FN & $93.28 \pm 0.66$ & $84.24 \pm 1.24$ & $9.04$ (\grntxt{6.96}) & $88.52 \pm 0.54$ & $75.78 \pm 0.22$ & $12.75$ (\grntxt{-1.37}) \\
% \midrule
% resnet50 @ IN & $24.09 \pm 0.50$ & $22.13 \pm 0.30$ & $1.96$ & $85.35 \pm 0.23$ & $68.10 \pm 0.59$ & $17.25$ \\
% resnet50 @ FN & $26.82 \pm 0.24$ & $24.63 \pm 0.07$ & $2.19$ (\rdtxt{+0.23}) & $86.16 \pm 0.16$ & $69.17 \pm 0.15$ & $17.00$ (\grntxt{-0.25}) \\
% resnet101 @ IN & $25.01 \pm 0.14$ & $23.21 \pm 0.31$ & $1.80$ & $86.35 \pm 0.28$ & $70.58 \pm 0.41$ & $15.77$ \\
% resnet101 @ FN & $29.61 \pm 0.38$ & $27.37 \pm 0.44$ & $2.24$ (\rdtxt{+0.44}) & $87.68 \pm 0.22$ & $73.74 \pm 0.50$ & $13.94$ (\grntxt{1.83}) \\
% \bottomrule
% \end{tabular}
% \caption{ImageNet-9 and CounterAnimal results for models trained on ImageNet (IN) and ForegroundNet (FN). The numbers in parentheses indicate the difference to the IN trainded model.}
% \end{table}
\textbf{Reasoning and purpose of \schemename (\rone):}
Traditional data augmentations are limited by existing image compositions, leading to biases where objects are centered and correlated with specific backgrounds.
\schemename aims to overcome these limitations by introducing object size, position, and background as independent, controllable degrees of freedom.
This approach explicitly exposes the model to a wider range of variations, actively reducing such compositional biases (see Tbls. 6, 8; Figs. 4, \ref{fig:bg-diff-results} (right)).
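For concreteness, the recombination idea can be sketched as follows. This is a minimal illustration assuming Pillow images, a precomputed foreground mask, and an object-free background; the parameter names are hypothetical, and the actual \schemename pipeline includes additional sampling strategies.

```python
import random
from PIL import Image

def recombine(foreground, mask, background, scale_range=(0.3, 1.0)):
    # Sample an independent size for the foreground object.
    s = random.uniform(*scale_range)
    w = max(1, int(foreground.width * s))
    h = max(1, int(foreground.height * s))
    fg = foreground.resize((w, h))
    m = mask.resize((w, h))
    # Sample an independent position where the object fully fits.
    x = random.randint(0, max(0, background.width - w))
    y = random.randint(0, max(0, background.height - h))
    # Paste onto an object-free background; the mask keeps only
    # foreground pixels, so the label stays that of the foreground.
    out = background.copy()
    out.paste(fg, (x, y), m)
    return out
```

Because size, position, and background are sampled independently, each degree of freedom can be varied (or held fixed) on its own, which is what enables both the augmentation and the bias analysis.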
Consequently, models trained with \schemename perform better on such lower-likelihood compositions, where conventionally trained models typically falter.
Moreover, \schemename serves as an analytical tool for measuring biases in any ImageNet-trained model (Sec. 4.3).
Acknowledging the need for more clarity, we have expanded Secs.~1 and~3 to further highlight the purpose and design, and revised Sec.~4.3 to more clearly connect the experimental findings to \schemename's goals.
\textbf{Novelty of ForAug (\rone, \rtwo):}
While inspired by Copy-Paste methods, \schemename makes distinct contributions by addressing the non-trivial challenges of classification, successfully \textit{``automating the copy-paste augmentation with [...] solid empirical gains''} (\rthree).
We want to emphasize several elements that make \schemename novel:
\textbf{(1)} Adapting Copy-Paste to image classification has, to our knowledge, only been attempted by STAug\footnote{\label{note:staug}J.-S. Kang and K. Chung, ``STAug: Copy-Paste Based Image Augmentation Technique Using Salient Target,'' IEEE Access, vol. 10, 2022.}, as an alternative to MixUp in a specialized domain.
The scarcity of such methods points to either the novelty of the idea or the difficulty of applying it successfully; both highlight the value of our specific design choices.
\textbf{(2)} We overcome key challenges in adapting Copy-Paste to classification.
To preserve label integrity, we generate plain background images by removing the main object.
Pasting a new foreground object onto these object-free backgrounds yields a clear, unambiguous label.
Unlike \footref{note:staug} and previous Copy-Paste methods, which paste onto existing dataset images, this ensures clean training signals and reduces spurious background correlations.
\textbf{(3)} \schemename incorporates large-scale position and size augmentations of the foregrounds to encode these equivariances into the training data for bias mitigation, a feature not used to the same extent in \footref{note:staug} or [11, 14, 55].
This targeted augmentation is key to the bias-reduction results we demonstrate.
\textbf{Directly comparing to Copy-Paste (\rtwo):}
Directly applying detection/segmentation Copy-Paste methods to classification is not straightforward because of
\textbf{(1)} their dependence on human-annotated foreground masks [14,28,41,53], which are generally not available for the large-scale datasets used in image classification, and
\textbf{(2)} the difficulty of determining the augmented image's label.
Pasting new foregrounds onto existing dataset images creates label ambiguity (should the label derive from the new object, the original one, or a multi-label combination?), unlike in detection/segmentation, where instance/pixel labels are preserved.
Thus, ``directly'' applying Copy-Paste requires many design choices and essentially leads to a novel method.
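To make the label-ambiguity point concrete, here is a toy sketch of one possible resolution policy (area-weighted soft labels). This is an illustrative choice of ours, not part of \schemename, which sidesteps the problem entirely by pasting onto object-free backgrounds.

```python
def soft_label(fg_class, bg_class, fg_area, img_area, num_classes):
    # Weight each class by its (rough) visible area share; just one of
    # many possible policies, each changing the training signal.
    w = fg_area / img_area
    label = [0.0] * num_classes
    label[fg_class] += w
    label[bg_class] += 1.0 - w
    return label
```

Every such policy is a nontrivial design decision, which is why a ``direct'' transfer of Copy-Paste to classification already amounts to a new method.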
\begin{table}
\centering
\small
\resizebox{.60\columnwidth}{!}{
\begin{tabular}{lccccc}
\toprule
\multirow{2.5}{*}{Model} & DeiT & \multicolumn{2}{c}{Ours (DeiT)} & \multicolumn{2}{c}{Ours (DeiT III)} \\
\cmidrule(lr){3-4} \cmidrule(lr){5-6}
& original & IN & FN & IN & FN \\
\midrule
ViT-S & 79.8${}^*$ & 80.5 & 80.3 & 79.1 & \textbf{81.4} \\
ViT-B & 81.8${}^*$ & 79.6 & \textbf{81.5} & 77.6 & 81.1 \\
Swin-S & - & 82.2 & \textbf{82.4} & 79.4 & 80.6 \\
\bottomrule
\end{tabular}}
\caption{Results when training on ImageNet (IN) and \name (FN) using different data augmentation schemes. ${}^*$DeiT uses EMA.}
\label{tbl:deit-pipe}
\end{table}
\textbf{Using different training pipelines (\rtwo):}
While \schemename improves accuracy, our data augmentation pipeline does indeed not reach the DeiT results.
However, this neither makes the comparison unfair nor suggests that \schemename would fail to improve performance under other augmentation schemes: the only difference between the ImageNet and \name runs is the inclusion of \schemename, so all performance gains must be attributed to it.
\cref{tbl:deit-pipe} (above) presents results with the DeiT pipeline for a subset of models (due to time constraints): \schemename still improves performance, especially for larger transformers, with the added benefit of bias reduction.
Full results will be added to the manuscript.
\begin{figure}[t!]
\centering
\resizebox{.84\columnwidth}{!}{
\includegraphics{../Diffusion MixUp/plots/rebuttal_bg_robustness.pdf}}
\caption{
Results on ImageNet9 and CounterAnimal. Bars span from out-of-distribution (OOD, bottom) to normal backgrounds (top). \name (orange) significantly improves OOD performance compared to ImageNet (blue), reducing the accuracy gap (bar size).
}
\label{fig:bg-diff-results}
\end{figure}
\textbf{Background Robustness using other datasets (\rthree):}
While our metric (Eq.~4) was designed to mitigate \name-bias by comparing relative accuracy drops under the same recombination scheme, we agree that assessment on external benchmarks adds to a comprehensive evaluation, and we have added the suggested benchmarks to the final manuscript (see \cref{fig:bg-diff-results} above).
These new results support our findings: \name reduces the background gap of transformers by boosting out-of-distribution performance.
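The gap visualized in \cref{fig:bg-diff-results} is simply the difference between accuracy on normal backgrounds and accuracy on out-of-distribution backgrounds; a minimal sketch (the function name is ours, for illustration):

```python
def background_gap(acc_normal, acc_ood):
    # Accuracy on normal backgrounds minus accuracy on OOD
    # backgrounds; smaller means less reliance on the background.
    return acc_normal - acc_ood
```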
\textbf{Additional compute and space costs (\rthree):}
We have added a discussion to the manuscript.
For ViT-B/16 on A100 GPUs, \schemename incurs only a minor $1\%$ increase in average step time ($528 \pm 2$\,ms on ImageNet vs.\ $534 \pm 1$\,ms on \name), since the online recombination is offloaded to the CPU and heavily parallelized.
Regarding disk space, \name requires 73\,GB, while ImageNet needs 147\,GB.
\end{document}