cvpr submission
This commit is contained in:
@@ -1,199 +1,343 @@
|
||||
% !TeX root = ../main.tex
|
||||
|
||||
|
||||
\begin{figure}[t]
|
||||
\begin{minipage}[t]{.62\textwidth}
|
||||
\captionof{table}{ImageNet results when training ViTs with different data augmentation pipelines.
|
||||
\schemename consistently improves performance in low- and mid-augmentation regimes and remains complementary to strong augmentation pipelines, with larger gains for larger models.
|
||||
}
|
||||
\label{tab:imagenet-pipelines}
|
||||
\centering
|
||||
\resizebox{\textwidth}{!}{
|
||||
\begin{tabular}{lccccc}
|
||||
\toprule
|
||||
\multirow{2.5}{*}{Augmentation} & \multirow{2.5}{*}{MixUp} & \multirow{2.5}{*}{CutMix} & \multicolumn{3}{c}{Accuracy [\%] using} \\
|
||||
\cmidrule(l){4-6}
|
||||
& & & ViT-S & ViT-B & ViT-L \\
|
||||
\midrule
|
||||
Basic & \xmark & \xmark & $71.9 \pm 0.1$ & $69.5 \pm 0.2$ & $68.3 \pm 0.4$ \\
|
||||
Basic + \schemename & \xmark & \xmark & $75.7 \pm 0.2$ & $75.5 \pm 0.6$ & $73.1 \pm 1.7$ \\
|
||||
& & & \grntxt{$+3.8$} & \grntxt{$+6.0$} & \grntxt{$+4.8$} \\
|
||||
\midrule
|
||||
RandAugment & \xmark & \xmark & $76.3 \pm 0.5$ & $75.5 \pm 0.2$ & $74.7 \pm 0.4$ \\
|
||||
RandAugment + \schemename & \xmark & \xmark & $78.0 \pm 0.1$ & $77.8 \pm 0.1$ & $78.0 \pm 0.6$ \\
|
||||
& & & \grntxt{$+1.7$} & \grntxt{$+2.3$} & \grntxt{$+3.3$} \\
|
||||
\midrule
|
||||
Basic & \cmark & \cmark & $79.8 \pm 0.3$ & $78.6 \pm 0.4$ & $78.1 \pm 1.6$ \\
|
||||
Basic + \schemename & \cmark & \cmark & $79.8 \pm 0.3$ & $81.6 \pm 0.5$ & $81.0 \pm 0.4$ \\
|
||||
& & & \gtxt{$\pm 0.0$} & \grntxt{$+3.0$} & \grntxt{$+2.9$} \\
|
||||
\midrule
|
||||
3-Augment & \xmark & \cmark & $79.1 \pm 0.1$ & $77.6 \pm 0.2$ & $75.3 \pm 0.4$ \\
|
||||
3-Augment + \schemename & \xmark & \cmark & $81.4 \pm 0.1$ & $81.1 \pm 0.4$ & $79.8 \pm 0.1$ \\
|
||||
& & & \grntxt{$+2.3$} & \grntxt{$+3.5$} & \grntxt{$+4.5$} \\
|
||||
\midrule
|
||||
RandAugment & \cmark & \cmark & $80.1 \pm 0.1$ & $81.9 \pm 0.3$ & $79.3 \pm 2.3$ \\
|
||||
RandAugment + \schemename & \cmark & \cmark & $80.0 \pm 0.3$ & $81.9 \pm 0.2$ & $82.4 \pm 0.1$ \\
|
||||
& & & \gtxt{$-0.1$} & \gtxt{$\pm 0.0$} & \grntxt{$+3.1$} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}[t]{.37\textwidth}
|
||||
\captionof{table}{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
|
||||
\label{tab:imagenet-results}
|
||||
\resizebox{\textwidth}{!}{\begin{tabular}{lccc}
|
||||
\toprule
|
||||
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
|
||||
\cmidrule(lr){2-3}
|
||||
& w/o \schemename & w/ \schemename & \\
|
||||
\midrule
|
||||
ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\
|
||||
ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\
|
||||
ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\
|
||||
\midrule
|
||||
DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\
|
||||
DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\
|
||||
DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\
|
||||
\midrule
|
||||
Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\
|
||||
Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\
|
||||
\midrule
|
||||
ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\
|
||||
ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{minipage}
|
||||
\end{figure}
|
||||
|
||||
% \begin{table}[t]
|
||||
% \caption{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
|
||||
% \label{tab:imagenet-results}
|
||||
% \centering
|
||||
% \begin{subfigure}{.41\textwidth}
|
||||
% \resizebox{\textwidth}{!}{\begin{tabular}{lccc}
|
||||
% \toprule
|
||||
% \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
|
||||
% \cmidrule(lr){2-3}
|
||||
% & w/o \schemename & w/ \schemename & \\
|
||||
% \midrule
|
||||
% ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\
|
||||
% ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\
|
||||
% ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\
|
||||
% \midrule
|
||||
% Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\
|
||||
% Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\
|
||||
% \bottomrule
|
||||
% \end{tabular}}
|
||||
% \end{subfigure}
|
||||
% \hspace{5pt}
|
||||
% \begin{subfigure}{.448\textwidth}
|
||||
% \resizebox{\textwidth}{!}{\begin{tabular}{lccc}
|
||||
% \toprule
|
||||
% \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
|
||||
% \cmidrule(lr){2-3}
|
||||
% & w/o \schemename & w/ \schemename & \\
|
||||
% \midrule
|
||||
% DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\
|
||||
% DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\
|
||||
% DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\
|
||||
% \midrule
|
||||
% ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\
|
||||
% ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
|
||||
% \bottomrule
|
||||
% \end{tabular}}
|
||||
% \end{subfigure}
|
||||
% \end{table}
|
||||
|
||||
\section{Experiments}
|
||||
\label{sec:experiments}
|
||||
|
||||
% \begin{itemize}
|
||||
% \item [1.] Training on RecombiNet
|
||||
% \item ImageNet results (large)
|
||||
% \item Ablation (TinyImageNet): Foreground position
|
||||
% \item Ablation (TinyImageNet): Which background (or part of other ablation table?)
|
||||
% \item Ablation (TinyImageNet+ImageNet For edge blur): Design decisions: Which infill model, pruning threshold, p$\to$t /t$\to$p, foreground rotation range (?), edge blur, original image probability/schedule, Foreground size
|
||||
% \item With other Data Augmentations
|
||||
% \item [2.] More evaluation metrics
|
||||
% \item Background accuracy (how to frame/sell? Background bias?) / Background robustness (= foreground with all background)?
|
||||
% \item Foreground focus
|
||||
% \item Position bias
|
||||
% \item Size bias
|
||||
% \end{itemize}
|
||||
|
||||
We conduct a comprehensive suite of experiments to validate the effectiveness of our approach,
|
||||
comparing ImageNet training with and without \schemename for 10 different models and 5 data augmentation pipelines.
|
||||
% We compare training on \name, the ImageNet instantiation of \schemename, to training on ImageNet for 10 different models.
|
||||
comparing ImageNet-training with and without \schemename for 10 different models.
|
||||
Furthermore, we assess the impact of using \schemename for pretraining on multiple fine-grained downstream datasets.
|
||||
Finally, we exploit \schemename's control over the image distribution to quantify model behaviors and biases.
|
||||
We always report the mean and standard deviation of three independent training runs.
|
||||
|
||||
\subsection{Image Classification Results}
|
||||
\subsection{Design Choices of ForAug}
|
||||
\label{sec:ablation}
|
||||
|
||||
\textbf{ImageNet training.}
|
||||
\Cref{tab:imagenet-pipelines} analyzes the effect of \schemename under different data augmentation pipelines:
|
||||
A \emph{basic} pipeline with RandomResizedCrop, Flip and ColorJitter, the \emph{3-Augment} pipeline from~\cite{Touvron2022,Nauen2025} that also includes Grayscale, Solarization and GaussianBlur, as well as the widely used \emph{RandAugment}-based~\cite{Cubuk2020} pipeline from DeiT~\cite{Touvron2021b}.
|
||||
Additionally, we include MixUp~\cite{Zhang2018a} and CutMix~\cite{Yun2019} augmentations.
|
||||
% We also include Mixup and CutMix.
|
||||
We find that the effectiveness of \schemename depends on the interplay between model capacity and baseline augmentation strength.
|
||||
When the baseline augmentation is weak or moderate, \schemename consistently improves ImageNet accuracy, with gains increasing for larger ViT models (up to $+6.0$ p.p.\ for ViT-B).
|
||||
As the augmentation pipeline becomes stronger (e.g., RandAugment with MixUp and CutMix), ImageNet improvements diminish for smaller models, indicating that the baseline augmentation already saturates their capacity.
|
||||
Importantly, even in cases where ImageNet accuracy does not improve, we consistently observe gains during downstream fine-tuning (see \Cref{tab:downstream-results}), suggesting that \schemename enhances representation quality beyond what is reflected by ImageNet accuracy.
|
||||
We start by ablating the design choices of \schemename on TinyImageNet~\cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each. %, and Tiny\name, the application of \schemename to TinyImageNet.
|
||||
% \Cref{tab:ablation} presents the results of these ablations.
|
||||
\Cref{tab:ablation-segment} presents ablations for segmentation and \Cref{tab:ablation-recombine} for recombination.
|
||||
|
||||
\Cref{tab:imagenet-results} additionally compares performance of different model architectures.
|
||||
ViT~\cite{Dosovitskiy2021}, Swin~\cite{Liu2021} and ResNet~\cite{He2016} (representing CNNs) are trained using the ``3-augment'' strategy, while DeiT~\cite{Touvron2021b} is trained using the ``RandAugment'' strategy.
|
||||
Notably, \schemename improves performance across all tested architectures, including the ResNet models, % (up to $1$ p.p.),
|
||||
demonstrating benefits beyond Transformers.
|
||||
% We find that \schemename's improvements counteract the drop in performance for increasing model sizes.
|
||||
% Without \schemename this drop is $3.8$ p.p. (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p.
|
||||
% For DeiT there is a drop of $0.8$ p.p. from small to large while when using \schemename there is a \emph{gain} of $2.4$ p.p.
|
||||
\begin{table}
|
||||
\caption{Ablation of the design decisions in the segmentation phase of \schemename on TinyImageNet.
|
||||
The first line is our baseline, while the other lines use \schemename.
|
||||
We use basic settings with the \emph{same} background strategy during recombination for this experiment.
|
||||
}
|
||||
\label{tab:ablation-segment}
|
||||
\centering
|
||||
\small
|
||||
\resizebox{.9\columnwidth}{!}{
|
||||
\begin{tabular}{cccc}
|
||||
\toprule
|
||||
\multirow{2.5}{*}{\makecell{Detect. \\Prompt}} & \multirow{2.5}{*}{\makecell{Infill \\ Model}} & \multicolumn{2}{c}{TinyImageNet Accuracy [\%]} \\
|
||||
\cmidrule{3-4}
|
||||
& & ViT-Ti & ViT-S \\
|
||||
\midrule
|
||||
\multicolumn{2}{l}{\textbf{TinyImageNet}} & $66.1 \pm 0.5$ & $68.3 \pm 0.7$ \\
|
||||
specific & LaMa \cite{Suvorov2021} & $65.5 \pm 0.4$ & $71.2 \pm 0.5$ \\
|
||||
general & \gtxt{LaMa \cite{Suvorov2021}} & $66.4 \pm 0.6$ & $72.9 \pm 0.6$ \\
|
||||
\gtxt{general} & Att. Eraser \cite{Sun2024} & $67.5 \pm 1.2$ & $72.4 \pm 0.5$ \\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[t]
|
||||
\caption{Downstream accuracy in percent when finetuning on other datasets. Models are pretrained on ImageNet with and without \schemename. Pretraining using \schemename increases transformer downstream accuracy.
|
||||
% on all datasets.
|
||||
}
|
||||
\label{tab:downstream-results}
|
||||
\begin{subfigure}{.48\columnwidth}
|
||||
\resizebox{\textwidth}{!}{\begin{tabular}{lcccccc}
|
||||
\toprule
|
||||
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
|
||||
\midrule
|
||||
ViT-S & \xmark & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\
|
||||
ViT-S & \cmark & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\
|
||||
& & \grntxt{$+6.2$} & \grntxt{$+2.4$} & \grntxt{$+1.0$} & \grntxt{$+0.5$} & \grntxt{$+0.7$} \\
|
||||
\midrule
|
||||
ViT-B & \xmark & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\
|
||||
ViT-B & \cmark & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\
|
||||
& & \grntxt{$+7.3$} & \grntxt{$+3.3$} & \grntxt{$+1.7$} & \grntxt{$+1.1$} & \grntxt{$+1.0$} \\
|
||||
\midrule
|
||||
ViT-L & \xmark & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\
|
||||
ViT-L & \cmark & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\
|
||||
& & \grntxt{$+5.5$} & \grntxt{$+0.3$} & \grntxt{$+2.2$} & \grntxt{$+1.2$} & \grntxt{$+0.9$} \\
|
||||
\midrule
|
||||
Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\
|
||||
Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\
|
||||
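& & \grntxt{$+4.1$} & \grntxt{$+2.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\ % NOTE(review): Cars delta is listed as +2.5, but the rows above give 92.8 - 91.3 = 1.5; verify against the raw results.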
|
||||
\midrule
|
||||
Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\
|
||||
Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\
|
||||
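& & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+1.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\ % NOTE(review): Flowers delta is listed as +1.4, but the rows above give 96.3 - 95.9 = 0.4; verify against the raw results.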
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}{.505\columnwidth}
|
||||
\resizebox{\textwidth}{!}{\begin{tabular}{lcccccc}
|
||||
\toprule
|
||||
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
|
||||
\midrule
|
||||
DeiT-S & \xmark & $75.3\pm0.4$ & $91.1\pm0.2$ & $94.8\pm0.4$ & $89.2\pm0.2$ & $92.4\pm0.2$ \\
|
||||
DeiT-S & \cmark & $76.8\pm0.8$ & $91.9\pm0.2$ & $95.2\pm0.3$ & $89.1\pm0.2$ & $92.3\pm0.4$ \\
|
||||
& & \grntxt{$+1.5$} & \grntxt{$+0.8$} & \grntxt{$+0.4$} & \gtxt{$-0.1$} & \gtxt{$-0.1$} \\
|
||||
\midrule
|
||||
DeiT-B & \xmark & $77.0\pm1.2$ & $92.9\pm0.2$ & $96.1\pm0.2$ & $91.2\pm0.1$ & $93.3\pm0.4$ \\
|
||||
DeiT-B & \cmark & $79.3\pm0.3$ & $93.1\pm0.1$ & $96.4\pm0.2$ & $91.3\pm0.1$ & $93.3\pm0.1$ \\
|
||||
& & \grntxt{$+2.3$} & \gtxt{$+0.2$} & \grntxt{$+0.3$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
|
||||
\midrule
|
||||
DeiT-L & \xmark & $72.8\pm5.5$ & $92.8\pm1.0$ & $95.8\pm1.5$ & $90.5\pm2.6$ & $92.4\pm2.0$ \\
|
||||
DeiT-L & \cmark & $78.8\pm0.8$ & $93.8\pm0.2$ & $97.0\pm0.2$ & $92.0\pm0.2$ & $93.5\pm0.2$ \\
|
||||
& & \grntxt{$+6.0$} & \grntxt{$+1.0$} & \grntxt{$+1.2$} & \grntxt{$+1.5$} & \grntxt{$+1.1$} \\
|
||||
\midrule
|
||||
ResNet-50 & \xmark & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\
|
||||
ResNet-50 & \cmark & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\
|
||||
& & \grntxt{$+2.1$} & \grntxt{$+0.6$} & \gtxt{$\pm0.0$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
|
||||
\midrule
|
||||
ResNet-101 & \xmark & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\
|
||||
ResNet-101 & \cmark & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\
|
||||
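& & \grntxt{$+3.0$} & \grntxt{$+1.3$} & \grntxt{$+1.7$} & \grntxt{$+0.3$} & \textcolor{red}{$-0.3$} \\ % NOTE(review): Cars delta is listed as +1.3, but the rows above give 91.3 - 90.3 = 1.0; verify against the raw results.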
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{subfigure}
|
||||
\caption{Ablation of the recombination phase of \schemename on TinyImageNet (top) and ImageNet (bottom). The first experiments use the initial segmentation settings with LaMa \cite{Suvorov2021}.}
|
||||
\label{tab:ablation-recombine}
|
||||
\centering
|
||||
\resizebox{\columnwidth}{!}{
|
||||
\begin{tabular}{ccccccccccc}
|
||||
\toprule
|
||||
% FG. & Augment. & BG. & BG. & Edge & Original & \multicolumn{2}{c}{Accuracy [\%]} \\
|
||||
% Size & Order & Strat. & Prune & Smoothing & Mixing & ViT-Ti & ViT-S \\
|
||||
\multirow{2.5}{*}{\makecell{FG. \\size}} & \multirow{2.5}{*}{\makecell{Augment.\\Order}} & \multirow{2.5}{*}{\makecell{BG\\Strat.}} & \multirow{2.5}{*}{\makecell{BG.\\Prune}} & \multirow{2.5}{*}{\makecell{Original\\Mixing}} & \multirow{2.5}{*}{\makecell{Edge\\Smooth.}} & \multicolumn{2}{c}{Accuracy [\%]} \\
|
||||
\cmidrule{7-8}
|
||||
& & & & & & ViT-Ti & ViT-S \\
|
||||
\midrule
|
||||
% TinyImageNet & & & & & & & $66.1\pm0.5$ & $68.3\pm0.7$ \\
|
||||
\multicolumn{6}{l}{\textbf{TinyImageNet}} & \gtxt{$66.1\pm0.5$} & \gtxt{$68.3\pm0.7$} \\
|
||||
mean & crop$\to$paste & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\
|
||||
range & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\
|
||||
\midrule
|
||||
% \gtxt{range} & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $66.4\pm0.6$ & $72.9\pm0.6$ \\
|
||||
{range} & {crop$\to$paste} & {same} & {-} & {-} & {-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\
|
||||
\gtxt{range} & paste$\to$crop & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\
|
||||
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 2.0$ & \gtxt{-} & $67.2\pm0.4$ & $72.9\pm0.5$ \\
|
||||
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{-} & $65.9\pm0.5$ & $72.4\pm0.6$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.2$ & \gtxt{-} & $69.8\pm0.5$ & $75.0\pm0.3$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.33$ & \gtxt{-} & $69.5\pm0.4$ & $75.2\pm1.0$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.5$ & \gtxt{-} & $70.3\pm1.0$ & $74.2\pm0.2$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & linear & \gtxt{-} & $70.1\pm0.7$ & $74.9\pm0.8$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & reverse lin. & \gtxt{-} & $67.6\pm0.2$ & $73.2\pm0.3$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & cos & \gtxt{-} & $71.3\pm1.0$ & $75.7\pm0.8$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & $70.0\pm0.8$ & $75.5\pm0.7$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & orig. & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $67.2\pm0.9$ & $69.9\pm1.0$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $70.1\pm0.7$ & $77.5\pm0.6$ \\
|
||||
\midrule
|
||||
\multicolumn{6}{l}{\textbf{ImageNet}} & \gtxt{-} & \gtxt{$79.1\pm0.1$} \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & \gtxt{-} & - & $80.5\pm0.1$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & - & $80.7\pm0.1$ \\
|
||||
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & - & $81.4\pm0.1$ \\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{table}
|
||||
|
||||
|
||||
\textbf{Prompt.}
|
||||
% We present the ablation of our main design decisions in \Cref{tab:ablation}.
|
||||
First, we evaluate the type of prompt used to detect the foreground object.
|
||||
Here, the \emph{general} prompt, which contains the class and the more general object category, outperforms using only the class name (\emph{specific}).
|
||||
|
||||
\textbf{Inpainting.} Among inpainting models, Attentive Eraser~\cite{Sun2024} produces slightly better results compared to LaMa~\cite{Suvorov2021} ($+0.5$ p.p.\ on average).
|
||||
For inpainting examples, see the supplementary material.
|
||||
% (see the supplementary material for examples).
|
||||
% When comparing the infill models, the GAN-based LaMa \cite{Suvorov2021} gets outperformed by the Attentive Eraser \cite{Sun2024}.
|
||||
|
||||
\textbf{Foreground size}
|
||||
% We observe that LaMa's often infills unnatural textures compared to Attentive Eraser.
|
||||
% The size of foreground objects during training has a significant impact on the performance.
|
||||
% Here, using the greater variability of the \emph{range} strategy increases the performance by $\approx 1\%$ compared to the \emph{mean} strategy.
|
||||
significantly impacts performance.
|
||||
Employing a \emph{range} of sizes during recombination, rather than a fixed \emph{mean} size, boosts accuracy by approximately 1 p.p.
|
||||
This suggests that the added variability is beneficial.
|
||||
|
||||
\textbf{Order of data augmentation.}
|
||||
% (1) Applying the image crop related augmentations \emph{before} pasting the foreground object and the color-based ones \emph{after} pasting or (2) applying all data augmentations after pasting the foreground object.
|
||||
% While results are ambiguous, we choose the second strategy, as it improves the performance of ViT-S, although not the one of ViT-Ti.
|
||||
Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}).
|
||||
ViT-Ti results are ambiguous.
|
||||
|
||||
\textbf{Background pruning.}
|
||||
When it comes to the backgrounds to use, we test different pruning thresholds ($t_\text{prune}$) to exclude backgrounds with large inpainted regions.
|
||||
% and only use backgrounds with an relative size of the infilled region of at most $t_\text{prune}$ (exclusive).
|
||||
A threshold of $t_\text{prune}=1.0$ means that we use all backgrounds that are not fully infilled.
|
||||
% We find that the background pruning does not significantly impact the models' performance.
|
||||
% We choose $t_\text{prune}=0.8$ for the following experiments to exclude backgrounds that are mostly artificial.
|
||||
Varying $t_\text{prune}$ has minimal impact.
|
||||
We choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds.
|
||||
|
||||
% One of the most important design decisions is the mixing of the original dataset with \name.
|
||||
\textbf{Mixing} \schemename-augmented samples with the original ImageNet data proves crucial.
|
||||
While constant and linear mixing schedules improve performance by $2$--$3$ p.p.\ compared to training on only augmented samples (no mixing), the cosine annealing schedule proves optimal, boosting accuracy by $3$--$4$ p.p.
|
||||
|
||||
\textbf{Edge smoothing.}
|
||||
We evaluate the impact of using Gaussian blurring to smooth the edges of the foreground masks.
|
||||
% Similarly, applying edge smoothing to foreground masks with Gaussian blurring actually hurts performance on Tiny\name, but slightly improves it on \name.
|
||||
For larger models, this gives us a slight performance boost on the full ImageNet (second to last line in \Cref{tab:ablation-recombine}).
|
||||
|
||||
\textbf{Background strategy.}
|
||||
Another point is the allowed choice of background image for each foreground object.
|
||||
% We evaluate three different strategies.
|
||||
% (1) Picking the background from which that specific foreground was originally extracted.
|
||||
% The major difference to ImageNet when using this setup is the variability in size and position of the foreground object.
|
||||
% (2) Picking a background that originally had a foreground object of the same class in it.
|
||||
% Here, we have backgrounds where objects of this type can typically appear while also creating a wider variety of samples due to pairing each foreground object with different backgrounds each time.
|
||||
% (3) Picking any background.
|
||||
% This choice has the largest variety of backgrounds, but the backgrounds are not semantically related to the foreground object anymore.
|
||||
% We find in \Cref{fig:bg-strategy} that choosing only a foreground's original background is the worst choice.
|
||||
We compare using the original background, a background from the same class, and any background.
|
||||
These strategies go from low diversity and high shared information content between the foreground and background to high diversity and low shared information content.
|
||||
For \emph{ViT-Ti}, the latter two strategies perform comparably, while \emph{ViT-S} benefits from the added diversity of using any background.
|
||||
The same is true when training on the full ImageNet.
|
||||
|
||||
|
||||
\begin{table}
|
||||
\caption{Accuracy of ViT-S on TinyImageNet (TIN) in percent using \schemename with different foreground position distributions by varying the Bates parameter $\eta$.
|
||||
The best performance is achieved when using the uniform distribution ($\eta=1$) for training.}
|
||||
\label{tbl:foreground-eta}
|
||||
\centering
|
||||
\small
|
||||
\resizebox{.9\columnwidth}{!}{
|
||||
\begin{tabular}{ccccccc}
|
||||
\toprule
|
||||
\multirow{2.5}{*}{\makecell{Bates Parameter \\during training}} & \multirow{2.5}{*}{\makecell{TIN \\w/o \schemename}} & \multicolumn{5}{c}{TIN w/ \schemename} \\
|
||||
\cmidrule(l){3-7}
|
||||
& & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\
|
||||
\midrule
|
||||
Baseline & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 \\
|
||||
$\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\
|
||||
$\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\
|
||||
$\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\
|
||||
$\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\
|
||||
$\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{table}
|
||||
|
||||
\textbf{Foreground position.}
|
||||
Finally, we analyze the foreground object's positioning in the image, using a
|
||||
generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \Z$.
|
||||
The Bates distribution presents an easy way to sample from a bounded domain with just one hyperparameter that controls its concentration.
|
||||
$\eta = 1/-1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders (see supplementary material for details).
|
||||
% We utilize an extended Bates distribution to sample the position of the foreground object.
|
||||
% The Bates distribution with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
|
||||
% The larger $\eta$, the more concentrated the distribution is at the center, $\eta < -1$ concentrates the distribution at the edges.
|
||||
% We extend this concept to $\eta \leq -1$, shifting the distribution away from the center and towards the edges.
|
||||
When sampling more towards the center of the image, the difficulty of the task is reduced, which reduces performance on TinyImageNet (\Cref{tbl:foreground-eta}).
|
||||
This is reflected in the performance when evaluating using \schemename with $\eta=2$ and $\eta=3$ compared to $\eta=1/-1$.
|
||||
We observe a similar reduction for $\eta < -1$.
|
||||
% This experiment is conducted using the LaMa infill model.
|
||||
|
||||
\begin{table}
|
||||
\caption{Dataset statistics for TinyImageNet and ImageNet with and without \schemename. For \schemename we report the number of foreground/background pairs.}
|
||||
\label{tab:dataset-stats}
|
||||
\centering
|
||||
\resizebox{.9\columnwidth}{!}{
|
||||
\begin{tabular}{l S[table-format=4.0] S[table-format=7.0] S[table-format=5.0]}
|
||||
\toprule
|
||||
Dataset & {Classes} & {\makecell{Training \\ Images}} & {\makecell{Validation \\ Images}} \\
|
||||
\midrule
|
||||
TinyImageNet & 200 & 100000 & 10000 \\
|
||||
TinyImageNet + \schemename & 200 & 99404 & 9915 \\
|
||||
ImageNet & 1000 & 1281167 & 50000 \\
|
||||
ImageNet + \schemename & 1000 & 1274557 & 49751 \\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{table}
|
||||
After fixing the optimal design parameters in \Cref{tab:ablation-segment,tab:ablation-recombine} (last rows), we run \schemename's segmentation step on the entire ImageNet dataset.
|
||||
\Cref{tab:dataset-stats} shows the resulting dataset statistics.
|
||||
% The slightly lower number of images in \name is due to \emph{Grounded SAM} returning no or invalid detections for some images.
|
||||
The slightly reduced image count for \schemename is due to instances where Grounded SAM fails to produce valid segmentation masks.
|
||||
|
||||
|
||||
\subsection{Image Classification Results}
|
||||
|
||||
\begin{table}
|
||||
\caption{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
|
||||
\label{tab:imagenet-results}
|
||||
\centering
|
||||
\small
|
||||
\resizebox{.8\columnwidth}{!}{\begin{tabular}{lccc}
|
||||
\toprule
|
||||
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
|
||||
\cmidrule(lr){2-3}
|
||||
& w/o \schemename & w/ \schemename & \\
|
||||
\midrule
|
||||
ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\
|
||||
ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\
|
||||
ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\
|
||||
\midrule
|
||||
DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\
|
||||
DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\
|
||||
DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\
|
||||
\midrule
|
||||
Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\
|
||||
Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\
|
||||
\midrule
|
||||
ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\
|
||||
ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{table}
|
||||
|
||||
\Cref{tab:imagenet-results} compares the ImageNet performance of models trained with and without \schemename.
|
||||
We adopt the training setup of~\cite{Nauen2025} and~\cite{Touvron2022} for training ViT~\cite{Dosovitskiy2021}, Swin~\cite{Liu2021} and ResNet~\cite{He2016} (representing CNNs) models as well as the setup of DeiT~\cite{Touvron2021b} for that model.
|
||||
Both setups use strong data augmentations like RandAugment, CutMix, and MixUp optimized for Transformers (details in supplementary material).
|
||||
Notably, \schemename improves performance across all tested architectures, including the ResNet models, % (up to $1$ p.p.),
|
||||
demonstrating benefits beyond Transformers.
|
||||
For DeiT we only observe benefits on ImageNet for the larger models.
|
||||
For other transformers, we observe improvements from $1.2$ p.p.\ to $4.5$ p.p., with increasing gains for larger models.
|
||||
% This improvement is more substantial for the larger models, with ViT-L gaining $4.5$ p.p. in accuracy.
|
||||
\schemename's improvements counteract the drop in performance for increasing model sizes.
|
||||
Without \schemename, this drop is $3.8$ p.p.\ (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p.
|
||||
For DeiT, there is a drop of $0.8$ p.p.\ from small to large, whereas with \schemename there is a \emph{gain} of $2.4$ p.p.
|
||||
|
||||
\begin{table}
|
||||
\caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-augment data augmentation on top of the copy-paste augmentation.}
|
||||
\label{tab:copy-paste-comparison}
|
||||
\centering
|
||||
\resizebox{\columnwidth}{!}{
|
||||
\begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]}
|
||||
\toprule
|
||||
Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\
|
||||
\midrule
|
||||
% Baseline & & $79.1 \pm 0.1$ \\
|
||||
Baseline + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\
|
||||
+ mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\
|
||||
+ fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\
|
||||
+ \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\
|
||||
+ infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\
|
||||
+ \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\
|
||||
+ edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\
|
||||
+ background pruning $=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{table}
|
||||
\textbf{Comparison to Simple Copy-Paste.}
|
||||
We compare \schemename to a simple adaptation of the Copy-Paste augmentation inspired by~\cite{Ge2023,Ghiasi2020,Shermaine2025} in \Cref{tab:copy-paste-comparison}.
|
||||
In contrast to semantic segmentation, we do not have foreground masks available.
|
||||
Thus, we paste the extracted foreground objects from \emph{\schemename's segmentation stage} onto normal ImageNet images.
|
||||
% Since such images do not have straight forward classification labels, we test multiple possibilities.
|
||||
We observe three large jumps in accuracy: (\textbf{1}) from our \emph{range} foreground size variation ($+11.4$ p.p.), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset ($+25.7$ p.p.), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images ($+12.5$ p.p.).
|
||||
\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance.
|
||||
|
||||
\begin{table}[t]
|
||||
\caption{Downstream accuracy in percent when finetuning on other datasets. Models are pretrained on ImageNet with and without \schemename. Pretraining using \schemename increases transformer downstream accuracy.
|
||||
% on all datasets.
|
||||
}
|
||||
\label{tab:downstream-results}
|
||||
\centering
|
||||
\resizebox{\columnwidth}{!}{\begin{tabular}{lcccccc}
|
||||
\toprule
|
||||
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
|
||||
\midrule
|
||||
ViT-S & \xmark & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\
|
||||
ViT-S & \cmark & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\
|
||||
& & \grntxt{$+6.2$} & \grntxt{$+2.4$} & \grntxt{$+1.0$} & \grntxt{$+0.5$} & \grntxt{$+0.7$} \\
|
||||
\cmidrule(r){1-1}
|
||||
ViT-B & \xmark & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\
|
||||
ViT-B & \cmark & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\
|
||||
& & \grntxt{$+7.3$} & \grntxt{$+3.3$} & \grntxt{$+1.7$} & \grntxt{$+1.1$} & \grntxt{$+1.0$} \\
|
||||
\cmidrule(r){1-1}
|
||||
ViT-L & \xmark & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\
|
||||
ViT-L & \cmark & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\
|
||||
& & \grntxt{$+5.5$} & \grntxt{$+0.3$} & \grntxt{$+2.2$} & \grntxt{$+1.2$} & \grntxt{$+0.9$} \\
|
||||
\midrule
|
||||
DeiT-S & \xmark & $75.3\pm0.4$ & $91.1\pm0.2$ & $94.8\pm0.4$ & $89.2\pm0.2$ & $92.4\pm0.2$ \\
|
||||
DeiT-S & \cmark & $76.8\pm0.8$ & $91.9\pm0.2$ & $95.2\pm0.3$ & $89.1\pm0.2$ & $92.3\pm0.4$ \\
|
||||
& & \grntxt{$+1.5$} & \grntxt{$+0.8$} & \grntxt{$+0.4$} & \gtxt{$-0.1$} & \gtxt{$-0.1$} \\
|
||||
\cmidrule(r){1-1}
|
||||
DeiT-B & \xmark & $77.0\pm1.2$ & $92.9\pm0.2$ & $96.1\pm0.2$ & $91.2\pm0.1$ & $93.3\pm0.4$ \\
|
||||
DeiT-B & \cmark & $79.3\pm0.3$ & $93.1\pm0.1$ & $96.4\pm0.2$ & $91.3\pm0.1$ & $93.3\pm0.1$ \\
|
||||
& & \grntxt{$+2.3$} & \gtxt{$+0.2$} & \grntxt{$+0.3$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
|
||||
\cmidrule(r){1-1}
|
||||
DeiT-L & \xmark & $72.8\pm5.5$ & $92.8\pm1.0$ & $95.8\pm1.5$ & $90.5\pm2.6$ & $92.4\pm2.0$ \\
|
||||
DeiT-L & \cmark & $78.8\pm0.8$ & $93.8\pm0.2$ & $97.0\pm0.2$ & $92.0\pm0.2$ & $93.5\pm0.2$ \\
|
||||
& & \grntxt{$+6.0$} & \grntxt{$+1.0$} & \grntxt{$+1.2$} & \grntxt{$+1.5$} & \grntxt{$+1.1$} \\
|
||||
\midrule
|
||||
Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\
|
||||
Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\
|
||||
& & \grntxt{$+4.1$} & \grntxt{$+1.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\ % Cars delta recomputed from the rows above (91.3 -> 92.8); verify against raw results
|
||||
\cmidrule(r){1-1}
|
||||
Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\
|
||||
Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\
|
||||
& & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+0.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\ % Flowers delta recomputed from the rows above (95.9 -> 96.3); verify against raw results
|
||||
\midrule
|
||||
ResNet-50 & \xmark & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\
|
||||
ResNet-50 & \cmark & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\
|
||||
& & \grntxt{$+2.1$} & \grntxt{$+0.6$} & \gtxt{$\pm0.0$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
|
||||
\cmidrule(r){1-1}
|
||||
ResNet-101 & \xmark & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\
|
||||
ResNet-101 & \cmark & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\
|
||||
& & \grntxt{$+3.0$} & \grntxt{$+1.0$} & \grntxt{$+1.7$} & \grntxt{$+0.3$} & \textcolor{red}{$-0.3$} \\ % Cars delta recomputed from the rows above (90.3 -> 91.3); verify against raw results
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{table}
|
||||
|
||||
\textbf{Downstream tasks.} To assess the transferability of \schemename-trained models, we finetune models pretrained on ImageNet with and without \schemename on five fine-grained datasets:
|
||||
@@ -203,102 +347,24 @@ In \Cref{tab:downstream-results} we see transformer accuracies improve on all th
|
||||
% and a reduction of error rate of up to $39.3\%$.
|
||||
% Notably, training with \name increases the downstream performance of DeiT-S and DeiT-B, even though the ImageNet results were the same.
|
||||
% This demonstrates that the improved representations from training on \name translate to superior performance beyond gains from better ImageNet performance.
|
||||
Notably, training with \schemename boosts the downstream performance of DeiT-S and DeiT-B, despite similar ImageNet accuracy.
|
||||
This shows that the improved representations from training with \schemename translate to gains beyond better ImageNet scores.
|
||||
Notably, training with \schemename boosts the downstream performance of DeiT-S and DeiT-B, despite similar ImageNet results.
|
||||
This shows the improved representations from training with \schemename translate to gains beyond better ImageNet scores.
|
||||
% not only on ImageNet, but also on fine-grained image classification tasks.
|
||||
|
||||
\begin{table}[t]
|
||||
\caption{Evaluation of models trained on ImageNet with and without \schemename. \schemename generally increases models' robustness to different image distribution shifts. Note that ViT-S \emph{with} \schemename outperforms DeiT-S, the only model where \schemename does not increase robustness.}
|
||||
\label{tab:robustness-datasets}
|
||||
\begin{subfigure}{.485\textwidth}
|
||||
\resizebox{\textwidth}{!}{
|
||||
\begin{tabular}{lccccccc}
|
||||
\toprule
|
||||
Model & w/ \schemename & IN-Hard & IN-A & IN-C & IN-R & IN-V2 \\
|
||||
\midrule
|
||||
ViT-S & \xmark & $18.1 \pm 0.6$ & $18.8 \pm 0.2$ & $44.7 \pm 0.8$ & $41.6 \pm 0.6$ & $67.3 \pm 0.4$ \\
|
||||
ViT-S & \cmark & $21.0 \pm 0.4$ & $26.5 \pm 0.4$ & $52.6 \pm 0.6$ & $49.8 \pm 0.3$ & $70.6 \pm 0.1$ \\
|
||||
& & \grntxt{$+2.9$} & \grntxt{$+7.7$} & \grntxt{$+7.9$} & \grntxt{$+8.1$} & \grntxt{$+3.3$} \\
|
||||
\midrule
|
||||
ViT-B & \xmark & $17.0 \pm 0.4$ & $15.8 \pm 0.7$ & $40.4 \pm 0.8$ & $38.4 \pm 0.7$ & $65.1 \pm 0.6$ \\
|
||||
ViT-B & \cmark & $22.0 \pm 0.9$ & $31.9 \pm 1.5$ & $51.6 \pm 1.8$ & $48.7 \pm 1.7$ & $70.3 \pm 0.9$ \\
|
||||
& & \grntxt{$+5.0$} & \grntxt{$+16.0$} & \grntxt{$+11.2$} & \grntxt{$+10.3$} & \grntxt{$+5.2$} \\
|
||||
\midrule
|
||||
ViT-L & \xmark & $15.6 \pm 0.4$ & $11.3 \pm 0.9$ & $38.4 \pm 1.0$ & $36.8 \pm 0.8$ & $61.6 \pm 0.8$ \\
|
||||
ViT-L & \cmark & $20.6 \pm 0.1$ & $30.4 \pm 0.5$ & $48.2 \pm 0.7$ & $46.0 \pm 0.4$ & $68.7 \pm 0.3$ \\
|
||||
& & \grntxt{$+5.0$} & \grntxt{$+19.0$} & \grntxt{$+9.8$} & \grntxt{$+9.3$} & \grntxt{$+7.1$} \\
|
||||
\midrule
|
||||
Swin-Ti & \xmark & $16.2 \pm 0.4$ & $15.0 \pm 0.3$ & $36.0 \pm 0.8$ & $36.6 \pm 0.2$ & $65.5 \pm 0.4$ \\
|
||||
Swin-Ti & \cmark & $18.3 \pm 0.3$ & $20.3 \pm 0.4$ & $41.4 \pm 0.8$ & $41.4 \pm 0.2$ & $68.2 \pm 0.4$ \\
|
||||
& & \grntxt{$+2.2$} & \grntxt{$+5.4$} & \grntxt{$+5.4$} & \grntxt{$+4.8$} & \grntxt{$+2.7$} \\
|
||||
\midrule
|
||||
Swin-S & \xmark & $18.2 \pm 0.3$ & $19.4 \pm 0.3$ & $39.0 \pm 0.7$ & $39.1 \pm 0.2$ & $67.5 \pm 0.1$ \\
|
||||
Swin-S & \cmark & $20.5 \pm 0.1$ & $27.7 \pm 0.4$ & $45.6 \pm 0.8$ & $44.1 \pm 0.3$ & $69.6 \pm 0.1$ \\
|
||||
& & \grntxt{$+2.2$} & \grntxt{$+8.4$} & \grntxt{$+6.6$} & \grntxt{$+5.0$} & \grntxt{$+2.2$} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}{.505\textwidth}
|
||||
\resizebox{\textwidth}{!}{
|
||||
\begin{tabular}{lccccccc}
|
||||
\toprule
|
||||
Model & w/ \schemename & IN-Hard & IN-A & IN-C & IN-R & IN-V2 \\
|
||||
\midrule
|
||||
DeiT-S & \xmark & $19.5 \pm 0.2$ & $18.4 \pm 0.3$ & $58.8 \pm 0.7$ & $43.0 \pm 0.1$ & $68.8 \pm 0.2$ \\
|
||||
DeiT-S & \cmark & $18.5 \pm 0.5$ & $17.3 \pm 1.0$ & $57.0 \pm 0.9$ & $43.8 \pm 0.2$ & $68.7 \pm 0.6$ \\
|
||||
& & \rdtxt{$-1.0$} & \rdtxt{$-1.1$} & \rdtxt{$-1.8$} & \grntxt{$+0.8$} & \gtxt{$-0.1$} \\
|
||||
\midrule
|
||||
DeiT-B & \xmark & $22.6 \pm 0.2$ & $26.0 \pm 0.2$ & $62.1 \pm 1.0$ & $45.6 \pm 1.9$ & $70.6 \pm 0.9$ \\
|
||||
DeiT-B & \cmark & $22.6 \pm 0.2$ & $25.0 \pm 0.3$ & $62.8 \pm 0.6$ & $47.7 \pm 0.8$ & $70.8 \pm 0.5$ \\
|
||||
& & \gtxt{$\pm 0.0$} & \rdtxt{$-1.0$} & \grntxt{$+0.8$} & \grntxt{$+2.0$} & \gtxt{$+0.2$} \\
|
||||
\midrule
|
||||
DeiT-L & \xmark & $21.2 \pm 2.0$ & $20.2 \pm 3.4$ & $59.3 \pm 4.3$ & $41.3 \pm 2.7$ & $66.9 \pm 2.8$ \\
|
||||
DeiT-L & \cmark & $23.4 \pm 0.3$ & $28.8 \pm 2.0$ & $63.4 \pm 0.7$ & $47.8 \pm 0.6$ & $71.6 \pm 0.5$ \\
|
||||
& & \grntxt{$+2.2$} & \grntxt{$+8.7$} & \grntxt{$+4.1$} & \grntxt{$+6.5$} & \grntxt{$+4.7$} \\
|
||||
\midrule
|
||||
ResNet50 & \xmark & $16.1 \pm 0.2$ & $9.7 \pm 0.1$ & $38.0 \pm 1.0$ & $40.5 \pm 0.6$ & $66.8 \pm 0.4$ \\
|
||||
ResNet50 & \cmark & $17.2 \pm 0.1$ & $10.8 \pm 0.4$ & $41.0 \pm 0.7$ & $43.7 \pm 0.3$ & $67.5 \pm 0.1$ \\
|
||||
& & \grntxt{$+1.1$} & \grntxt{$+1.1$} & \grntxt{$+3.0$} & \grntxt{$+3.2$} & \grntxt{$+0.7$} \\
|
||||
\midrule
|
||||
ResNet101 & \xmark & $18.2 \pm 0.4$ & $14.3 \pm 0.1$ & $41.7 \pm 0.7$ & $42.3 \pm 0.1$ & $67.7 \pm 0.5$ \\
|
||||
ResNet101 & \cmark & $19.9 \pm 0.2$ & $17.6 \pm 0.5$ & $46.3 \pm 0.6$ & $46.3 \pm 0.3$ & $69.5 \pm 0.3$ \\
|
||||
& & \grntxt{$+1.7$} & \grntxt{$+3.2$} & \grntxt{$+4.6$} & \grntxt{$+4.0$} & \grntxt{$+1.8$} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
}
|
||||
\end{subfigure}
|
||||
\end{table}
|
||||
|
||||
\subsection{Bias and Robustness Evaluation}
|
||||
% Additional to just using \name for training, its special properties and posibilities for adjustment of the data distribution make it a valuable tool for evaluating other model properties and biases.
|
||||
Beyond its use for training, \schemename's unique properties and controlled data generation capabilities make it a powerful tool for analyzing behavior and biases of black-box models.
|
||||
We exploit this in two complementary ways.
|
||||
First, we ask whether \schemename-trained models are more robust on \emph{external} ImageNet robustness benchmarks that are not generated by our pipeline.
|
||||
Second, we use \schemename's fine-grained control for targeted evaluation of specific dimensions of model bias, such as background reliance and center/size bias.
|
||||
% Together, these experiments allow us to both \emph{probe} and \emph{improve} robustness along clearly defined axes.
|
||||
% This combination of standard benchmarks and controlled probes allows us to both quantify robustness improvements and attribute them to changes in particular model behaviors.
|
||||
|
||||
\textbf{Robustness on External Distribution Shifts.}
|
||||
\Cref{tab:robustness-datasets} summarizes accuracy on five widely used ImageNet robustness benchmarks: ImageNet-Hard~\cite{Taesiri2023}, ImageNet-A~\cite{Hendrycks2021}, ImageNet-C~\cite{Hendrycks2019}, ImageNet-R~\cite{Hendrycks2021a}, and ImageNetV2~\cite{Recht2019}.
|
||||
Across ViTs, Swin Transformers, and ResNets, incorporating \schemename during training generally improves robustness to all considered distribution shifts.
|
||||
For ViTs, the gains are substantial: for example, ViT-B improves from $15.8\%$ to $31.9\%$ accuracy on ImageNet-A ($+16.0$ p.p.) and from $40.4\%$ to $51.6\%$ on ImageNet-C ($+11.2$ p.p.), with similar improvements for ViT-S and ViT-L.
|
||||
Swin also benefits consistently, with increases of roughly $2$--$8$ p.p.\ on most benchmarks, and ResNet sees smaller but steady gains (up to $+4.6$ p.p.\ on ImageNet-C).
|
||||
|
||||
For DeiT, the picture is more nuanced: DeiT-B and DeiT-L still enjoy robustness improvements, whereas DeiT-S exhibits small decreases on several benchmarks.
|
||||
Interestingly, however, ViT-S trained with \schemename outperforms the DeiT-S baseline on four of the five benchmarks (all except ImageNet-C).
|
||||
This suggests that controlled composition can partially close the robustness gap between lightly and heavily regularized models.
|
||||
Overall, the consistent improvements on corruption-based, natural and hard examples indicate that the compositional invariances induced by \schemename extend beyond the specific foreground/background manipulations used in its construction.
|
||||
|
||||
\begin{figure*}[t]
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{img/bg_robustness.pdf}
|
||||
\caption{Evaluation of background robustness on ImageNet + \schemename, ImageNet9~\cite{Xiao2020} and CounterAnimal~\cite{Wang2024f}.
|
||||
We plot the in-distribution (top of arrow) and the out-of-distribution (bottom of arrow) accuracy when training with and without \schemename.
|
||||
We annotate each arrow with its length $\Delta$.
|
||||
Training with \schemename improves the background robustness of all transformers by mostly boosting the out-of-distribution accuracy.
|
||||
}
|
||||
\label{fig:background-robustness}
|
||||
\begin{figure*}
|
||||
\centering
|
||||
\includegraphics[width=.95\textwidth]{img/bg_robustness.pdf}
|
||||
\caption{Evaluation of background robustness on ImageNet + \schemename, ImageNet9 and CounterAnimal.
|
||||
We plot the in-distribution (top of arrow) and the out-of-distribution (bottom of arrow) accuracy when training with and without \schemename.
|
||||
We annotate each arrow with its length $\Delta$.
|
||||
Training with \schemename improves the background robustness of all transformers by mostly boosting the out-of-distribution accuracy.
|
||||
}
|
||||
\label{fig:background-robustness}
|
||||
\end{figure*}
|
||||
|
||||
\textbf{Background Robustness.}
|
||||
@@ -311,18 +377,19 @@ We assess the robustness of models to shifts in the background distribution from
|
||||
% \text{Background Robustness} = \frac{\text{Acc}(\name_\text{all})}{\text{Acc}(\name_\text{same})}
|
||||
% \end{align}
|
||||
% It represents the relative drop in performance under a background distribution shift.
|
||||
\Cref{fig:background-robustness} presents the background robustness results for three datasets: ImageNet with \schemename (all backgrounds vs. backgrounds of same class), ImageNet9~\cite{Xiao2020} (random backgrounds vs. original backgrounds), and CounterAnimal~\cite{Wang2024f} (counter vs. common background).
|
||||
\Cref{fig:background-robustness} presents the background robustness results for three datasets: ImageNet with \schemename (all backgrounds vs. backgrounds of same class), ImageNet9 \cite{Xiao2020} (random backgrounds vs. original backgrounds), and CounterAnimal \cite{Wang2024f} (counter vs. common background).
|
||||
The top triangle of each arrow represents the in-distribution backgrounds and the bottom triangle represents the out-of-distribution ones.
|
||||
We follow ImageNet9 and CounterAnimal and assess the background robustness in terms of the accuracy gap when evaluating a model on images of normal background distribution compared to out-of-distribution backgrounds (length of each arrow; $\Delta$).
|
||||
% When trained on ImageNet, smaller models generally exhibit greater robustness to changes in the background distribution than larger models and ResNet is more robust than the tested Transformer models.
|
||||
Crucially, \schemename improves the background robustness of all models and across datasets, reducing the background-gap by boosting the performance on the out-of-background-distribution samples more than the in-distribution ones.
|
||||
We find a similar trend for the Corner-Cases~\cite{Fatima2025} dataset (see supplementary), highlighting the generalization benefits of \schemename to unusual image compositions.
|
||||
% to $\approx1.00$, meaning that these models are agnostic to the choice of background and only classify based on the foreground.
|
||||
These findings highlight the generalization benefits of \schemename to unusual image compositions.
|
||||
|
||||
\begin{figure*}[t]
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{img/fg_focus.pdf}
|
||||
\caption{Evaluation of the foreground focus (\Cref{eq:fg-focus}) using GradCam, GradCam++ and IntegratedGradients (IG) of models trained on ImageNet. Training with \schemename improves the foreground focus of almost all models.}
|
||||
\label{fig:foreground-focus}
|
||||
\begin{figure*}
|
||||
\centering
|
||||
\includegraphics[width=.95\textwidth]{img/fg_focus.pdf}
|
||||
\caption{Evaluation of the foreground focus (\Cref{eq:fg-focus}) using GradCam, GradCam++ and IntegratedGradients (IG) of models trained on ImageNet. Training with \schemename improves the foreground focus of almost all models.}
|
||||
\label{fig:foreground-focus}
|
||||
\end{figure*}
|
||||
|
||||
\textbf{Foreground Focus.}
|
||||
@@ -332,7 +399,7 @@ We can directly evaluate ImageNet-trained models, but this technique can also be
|
||||
To evaluate the foreground focus, we employ Grad-CAM \cite{Selvaraju2016}, Grad-CAM++ \cite{Chattopadhay2018} and IntegratedGradients (IG) \cite{Sundararajan2017} to compute the per-pixel importance of an image for the model's prediction.
|
||||
The foreground focus is defined to be the ratio of the foreground's relative importance to its relative size in the image:
|
||||
\begin{align} \label{eq:fg-focus}
|
||||
\text{FG Focus}(\text{img}) = \frac{\text{Area}(\text{img}) \hspace{3pt} \text{Importance}(\text{fg})}{\text{Area}(\text{fg}) \hspace{3pt} \text{Importance}(\text{img})}
|
||||
\text{FG Focus}(\text{img}) = \frac{\text{Area}(\text{img}) \hspace{3pt} \text{Importance}(\text{fg})}{\text{Area}(\text{fg}) \hspace{3pt} \text{Importance}(\text{img})}
|
||||
\end{align}
|
||||
If all pixels uniformly receive the same importance value, the foreground focus is one.
|
||||
The foreground focus of a model is its average focus over all test images.
|
||||
@@ -345,59 +412,46 @@ We hypothesize Swin's below-uniform foreground focus with GradCam is due to its
|
||||
% These differences might be due to the way GradCam is calculated for Swin \todo{cite package website where this is from} and the \todo{common critique of GradCam}.
|
||||
|
||||
\begin{table}[t]
|
||||
\caption{
|
||||
% Evaluation of the center bias.
|
||||
Accuracy relative to the center accuracy of multiple instantiations of the models when the foreground object is in different cells of a $3 \times 3$ grid.
|
||||
We calculate center bias according to \Cref{eq:center-bias}.
|
||||
Using \schemename significantly reduces models' center bias.}
|
||||
\label{tab:center-bias}
|
||||
\begin{subfigure}{.48\columnwidth}
|
||||
\resizebox{\textwidth}{!}{
|
||||
\begin{tabular}{lccc}
|
||||
\toprule
|
||||
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\
|
||||
\cmidrule(lr){2-3}
|
||||
& w/o \schemename & w/ \schemename \\
|
||||
\midrule
|
||||
ViT-S & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_ImageNet_v3.pdf} & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_RecombNet_all_v3.pdf} \\
|
||||
& $25.5\pm0.8$ & $22.0\pm0.3$ & \grntxt{$-3.5$} \\
|
||||
ViT-B & {\includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_ImageNet_v3.pdf}} & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_RecombNet_all_v3.pdf} \\
|
||||
& $25.4\pm0.4$ & $19.0\pm0.2$ & \grntxt{$-6.4$} \\
|
||||
ViT-L & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_ImageNet_v3.pdf} & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_RecombNet_all_v3.pdf} \\
|
||||
& $24.3\pm1.1$ & $11.7\pm0.7$ & \grntxt{$-12.6$} \\
|
||||
\midrule
|
||||
Swin-Ti & {\includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_ImageNet_v3.pdf}} & {\includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_RecombNet_all_v3.pdf}} \\
|
||||
& $25.0\pm0.7$ & $16.5\pm0.2$ & \grntxt{$-8.5$} \\
|
||||
Swin-S & {\includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_ImageNet_v3.pdf}} & {\includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_RecombNet_all_v3.pdf}} \\
|
||||
& $23.2\pm0.1$ & $15.6\pm0.2$ & \grntxt{$-7.6$} \\
|
||||
\bottomrule
|
||||
\end{tabular} }
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}{.497\columnwidth}
|
||||
\resizebox{\textwidth}{!}{
|
||||
\begin{tabular}{lccc}
|
||||
\toprule
|
||||
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\
|
||||
\cmidrule(lr){2-3}
|
||||
& w/o \schemename & w/ \schemename \\
|
||||
\midrule
|
||||
DeiT-S & {\includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_ImageNet_v3.pdf} } & {\includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_fornet_all_linear_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_fornet_all_linear_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_fornet_all_linear_v3.pdf}} \\
|
||||
& $20.4 \pm 0.2$ & $21.2 \pm 0.1$ & \gtxt{$+0.8$} \\
|
||||
DeiT-B & {\includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_ImageNet_v3.pdf} } & {\includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_fornet_all_cos_v3.pdf}} \\
|
||||
& $19.0 \pm 0.7$ & $19.0 \pm 0.2$ & \gtxt{$\pm0.0$} \\
|
||||
DeiT-L & { \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_ImageNet_v3.pdf} } & { \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_fornet_all_cos_v3.pdf} } \\
|
||||
& $21.2 \pm 0.2$ & $18.0 \pm 0.2$ & \grntxt{$-3.2$} \\
|
||||
\midrule
|
||||
ResNet50 & {\includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_ImageNet_v3.pdf}} & {\includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_RecombNet_all_v3.pdf}} \\
|
||||
& $26.3\pm0.3$ & $19.7\pm0.3$ & \grntxt{$-6.6$} \\
|
||||
ResNet101 & {\includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_ImageNet_v3.pdf}} & {\includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_RecombNet_all_v3.pdf}} \\
|
||||
& $23.0\pm0.3$ & $19.9\pm0.2$ & \grntxt{$-3.1$} \\
|
||||
\bottomrule
|
||||
\end{tabular} }
|
||||
\end{subfigure}
|
||||
\centering
|
||||
\includegraphics[width=.5\columnwidth]{img/colorbar_horizontal.pdf}
|
||||
\caption{
|
||||
% Evaluation of the center bias.
|
||||
Accuracy relative to the center accuracy of multiple instantiations of the models when the foreground object is in different cells of a $3 \times 3$ grid.
|
||||
We calculate center bias according to \Cref{eq:center-bias}.
|
||||
Using \schemename significantly reduces models' center bias.}
|
||||
\label{tab:center-bias}
|
||||
\centering
|
||||
\resizebox{.78\columnwidth}{!}{
|
||||
\begin{tabular}{lccc}
|
||||
\toprule
|
||||
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\
|
||||
\cmidrule(lr){2-3}
|
||||
& w/o \schemename & w/ \schemename \\
|
||||
\midrule
|
||||
ViT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v3.pdf}} \\
|
||||
& $25.5\pm0.8$ & $22.0\pm0.3$ & \grntxt{$-3.5$} \\
|
||||
ViT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v3.pdf}} \\
|
||||
& $25.4\pm0.4$ & $19.0\pm0.2$ & \grntxt{$-6.4$} \\
|
||||
ViT-L & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v3.pdf}} \\
|
||||
& $24.3\pm1.1$ & $11.7\pm0.7$ & \grntxt{$-12.6$} \\
|
||||
\midrule
|
||||
DeiT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v3.pdf}} \\
|
||||
& $20.4 \pm 0.2$ & $21.2 \pm 0.1$ & \gtxt{$+0.8$} \\
|
||||
DeiT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v3.pdf}} \\
|
||||
& $19.0 \pm 0.7$ & $19.0 \pm 0.2$ & \gtxt{$\pm0.0$} \\
|
||||
DeiT-L & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v3.pdf} } & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v3.pdf} } \\
|
||||
& $21.2 \pm 0.2$ & $18.0 \pm 0.2$ & \grntxt{$-3.2$} \\
|
||||
\midrule
|
||||
Swin-Ti & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v3.pdf}} \\
|
||||
& $25.0\pm0.7$ & $16.5\pm0.2$ & \grntxt{$-8.5$} \\
|
||||
Swin-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v3.pdf}} \\
|
||||
& $23.2\pm0.1$ & $15.6\pm0.2$ & \grntxt{$-7.6$} \\
|
||||
\midrule
|
||||
ResNet50 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v3.pdf}} \\
|
||||
& $26.3\pm0.3$ & $19.7\pm0.3$ & \grntxt{$-6.6$} \\
|
||||
ResNet101 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v3.pdf}} \\
|
||||
& $23.0\pm0.3$ & $19.9\pm0.2$ & \grntxt{$-3.1$} \\
|
||||
\bottomrule
|
||||
\end{tabular} }
|
||||
\includegraphics[width=.8\columnwidth]{img/colorbar_horizontal.pdf}
|
||||
\end{table}
|
||||
|
||||
\textbf{Center Bias.}
|
||||
@@ -413,23 +467,22 @@ The center bias is calculated as one minus the average of the minimum performanc
|
||||
% \end{split}
|
||||
% \end{align}
|
||||
\begin{align} \label{eq:center-bias}
|
||||
\text{Center Bias} = 1 - \frac{\min\limits_{c \in \text{sides}} \text{Acc}(c) + \min\limits_{c \in \text{corners}} \text{Acc}(c)}{2 \text{Acc}(c_\text{center})}
|
||||
\text{Center Bias} = 1 - \frac{\min\limits_{c \in \text{sides}} \text{Acc}(c) + \min\limits_{c \in \text{corners}} \text{Acc}(c)}{2 \text{Acc}(c_\text{center})}
|
||||
\end{align}
|
||||
\Cref{tab:center-bias} visualizes the center bias of three instantiations of each model.
|
||||
Performance is generally highest in the center and lowest in the four corners.
|
||||
Interestingly, ImageNet-trained models perform slightly better when the foreground object is on the right side of the image, compared to the left side, despite our use of random flipping with a probability of $0.5$ during training.
|
||||
% Training on \name reduces the center bias of all models by at least half.
|
||||
Using \schemename significantly reduces center bias across models, with a more uniform performance especially across the middle row.
|
||||
% On corner-cases (see supplementary) we find that
|
||||
% Their accuracy is higher in the center left and right cells than in the center top and bottom ones, which is not the case for ImageNet-trained models.
|
||||
% This demonstrates that \schemename promotes a more uniform spatial attention distribution, counteracting the center-bias of ImageNet.
|
||||
Thus, \schemename makes the model recognize objects across a wider spatial distribution, counteracting the center bias of ImageNet.
|
||||
|
||||
\begin{figure}[t!]
|
||||
\centering
|
||||
\includegraphics[width=\columnwidth]{img/size_bias_wide.pdf}
|
||||
\caption{Evaluation of the size bias of models trained on ImageNet. We plot the accuracy relative to the accuracy when using the default size ($f_\text{size} = 1.0$).}
|
||||
\label{fig:size-bias}
|
||||
\centering
|
||||
\includegraphics[width=\columnwidth]{img/size_bias_grid.pdf}
|
||||
\caption{Evaluation of the size bias of models trained on ImageNet. We plot the accuracy relative to the accuracy when using the default size ($f_\text{size} = 1.0$).}
|
||||
\label{fig:size-bias}
|
||||
\end{figure}
|
||||
|
||||
\textbf{Size Bias.}
|
||||
@@ -439,87 +492,6 @@ We introduce a size factor $f_\text{size}$ by which we additionally scale the fo
|
||||
Results are normalized by the accuracy when using $f_\text{size} = 1.0$.
|
||||
\Cref{fig:size-bias} shows the size bias curves of models trained with and without \schemename.
|
||||
% When training on \name, the resulting model keeps it's good performance on smaller foreground objects, while models trained on ImageNet fall of faster and lower.
|
||||
Models trained using \schemename perform better, especially with smaller foreground objects.
|
||||
Models trained using \schemename maintain better performance, especially with smaller foreground objects.
|
||||
%, when ImageNet-trained models exhibit a more rapid performance decline.
|
||||
Therefore, \schemename-training improves robustness to variations in object scale, especially for larger models.
|
||||
|
||||
|
||||
\subsection{Design Choices of \schemename}
|
||||
We next analyze key components of \schemename, focusing on three questions: how it compares to simple copy-paste, how background choice affects performance, and how reliably labels are preserved after recomposition.
|
||||
Additional ablations over variants and hyperparameters are provided in the supplementary material.
|
||||
|
||||
\begin{table}[t]
|
||||
\caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-Augment data augmentation on top of the copy-paste augmentation.}
|
||||
\label{tab:copy-paste-comparison}
|
||||
\centering
|
||||
\resizebox{.66\columnwidth}{!}{
|
||||
\begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]}
|
||||
\toprule
|
||||
Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\
|
||||
\midrule
|
||||
% Baseline & & $79.1 \pm 0.1$ \\
|
||||
3-Augment + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\
|
||||
+ mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\
|
||||
+ fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\
|
||||
+ \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\
|
||||
+ infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\
|
||||
+ \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\
|
||||
+ edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\
|
||||
+ background pruning $=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\
|
||||
\bottomrule
|
||||
\end{tabular}}
|
||||
\end{table}
|
||||
\textbf{Comparison to Simple Copy-Paste.}
|
||||
We compare \schemename to a simple adaptation of the Copy-Paste augmentation inspired by~\cite{Ge2023,Ghiasi2021,Shermaine2025} in \Cref{tab:copy-paste-comparison}.
|
||||
Unlike in semantic segmentation, we do not have foreground masks available.
|
||||
Thus, we paste the extracted objects from \textbf{\schemename's segmentation stage} onto normal ImageNet images.
|
||||
% Since such images do not have straight forward classification labels, we test multiple possibilities.
|
||||
We observe three large jumps in accuracy: (\textbf{1}) from our \emph{range} foreground size variation (+11.4\%), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset (+25.7\%), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images (+12.5\%).
|
||||
\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance.
|
||||
|
||||
\begin{figure}[t]
|
||||
\begin{minipage}[c]{.49\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{img/strategy.pdf}
|
||||
\captionof{figure}{We compare Original, Same-class, and All-classes background selection using ViT-Ti and ViT-S backbones on TinyImageNet.
|
||||
Increasing background diversity consistently improves classification accuracy.
|
||||
}
|
||||
\label{fig:background-strategy}
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}[c]{.49\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{img/mask_expansion.pdf}
|
||||
\captionof{figure}{
|
||||
We vary the foreground mask area for TinyImageNet by shrinking or expanding masks relative to the original outline and report accuracy when training on $100\%$ augmented samples.
|
||||
Performance is stable for expanded masks and degrades rapidly when masks are shrunk.
|
||||
}
|
||||
\label{fig:mask-expansion}
|
||||
\end{minipage}
|
||||
\end{figure}
|
||||
|
||||
\textbf{Background Choice Strategy.}
|
||||
\Cref{fig:background-strategy} shows the effect of background selection on TinyImageNet accuracy, where we trade off diversity against context plausibility.
|
||||
% Using the original inpainted background yields the lowest accuracy, indicating limited regularization from contextual cues.
|
||||
% Sampling backgrounds from the same class provides a modest but consistent improvement, suggesting that mild context variation encourages robustness while preserving semantic plausibility.
|
||||
The best performance is achieved by sampling backgrounds from all classes, which introduces substantial context shifts, but leads to the strongest accuracy gains for both ViT-Ti and ViT-S.
|
||||
Thus, aggressive background diversification is more important than context plausibility and acts as an effective form of context-based regularization rather than introducing harmful noise.
|
||||
|
||||
\textbf{Label Integrity.}
|
||||
% We assess the label integrity of \schemename, i.e., whether object labels remain correct after recombination, by verifying that the intended object is accurately extracted.
|
||||
% To this end, we leverage the object bounding box annotations provided in the ImageNet validation set.
|
||||
% Specifically, we compute the \emph{box precision}, defined as the fraction of the predicted mask area that lies within the ground-truth bounding box, obtaining a mean value of $91\%$.
|
||||
% In addition, we measure the \emph{box-to-box IoU}, computed as the IoU between the tight bounding box enclosing the predicted mask and the tight bounding box of the ground-truth annotation, which yields a high $76.1\%$.
|
||||
% Qualitative examples of the predicted masks and bounding boxes are provided in the supplementary material.
|
||||
% We additionally test label integrity under systematic mask perturbations by expanding or shrinking the foreground masks before composition.
|
||||
% Concretely, starting from the original outline, we erode or dilate the mask such that the foreground area changes by some percentage.
|
||||
% \Cref{fig:mask-expansion} shows that accuracy is relatively stable for expanded masks, but drops off significantly for eroded masks, consistent with cropping away semantically important object parts.
|
||||
% This experiment suggests, that \schemename is relatively robust to artifacts from including an object's original background in the foreground mask.
|
||||
% Overall, these results indicate that the segmentation stage of \schemename reliably isolates the target class object, thereby preserving label correctness after recombination.
|
||||
To quantify whether recombined images still depict the intended class, we evaluate the segmentation stage of \schemename on ImageNet validation boxes.
|
||||
Our predicted masks achieve a mean box precision of $91.0\%$ (fraction of mask area inside the ground-truth bounding boxes of the ImageNet validation set) and a high box-to-box IoU of $76.1\%$, indicating that they tightly capture the target object.
|
||||
Qualitative examples of the predicted masks and bounding boxes are provided in the supplementary material.
|
||||
We further probe robustness to mask imprecision by eroding or dilating masks such that the foreground area changes by a fixed percentage before composition.
|
||||
As shown in \Cref{fig:mask-expansion}, accuracy remains stable for expansions but drops sharply under erosion, consistent with removing semantically important object parts.
|
||||
Together, these results suggest that (\textit{i}) \schemename reliably isolates the target objects and preserves label integrity and that (\textit{ii}) \schemename is robust to artifacts from an object's original background and degrades mainly when the foreground no longer contains the full object.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user