% !TeX root = ../main.tex

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{img/fig-2.pdf}
    \caption{Overview of \schemename.
    We segment the foreground object and inpaint the removed region to obtain a neutral background (Offline, \Cref{sec:segmentation}).
    We then paste the foreground onto a sampled background while controlling position and scale, and apply standard strong augmentations (Online, \Cref{sec:recombination}).}
    \label{fig:method}
\end{figure*}

\section{\schemename}
\label{sec:method}

We introduce \schemename, a data augmentation scheme designed to enhance training by embedding spatial invariances, which Transformers would otherwise need to learn implicitly, directly into the training data.
\schemename comprises two stages, Segmentation and Recombination, both illustrated in \Cref{fig:method}.

\subsection{Segmentation}
\label{sec:segmentation}
The offline segmentation stage produces reusable assets for recombination.
For each labeled training image, we create a pair $(\mathrm{fg},\mathrm{bg})$ consisting of (\textit{i}) a foreground cut-out $\mathrm{fg}$ with an alpha mask and (\textit{ii}) an inpainted background image $\mathrm{bg}$ from which the foreground region has been removed.
This stage is computed once offline, and the results are stored for the recombination stage.

\textbf{Generate candidate foreground masks.}
We obtain foreground candidates with Grounded SAM~\cite{Ren2024} (Grounding DINO~\cite{Liu2024a} + SAM~\cite{Kirillov2023}).
We leverage the dataset label by prompting the model with ``\code{a <class name>, a type of <object category>}''.
Here, \code{<object category>} is the immediate WordNet hypernym of the class (e.g., ``sorrel'' $\rightarrow$ ``horse''), which improves robustness when the class name is rare or overly specific.
This is the case for prompts like ``sorrel'' or ``guenon'', where the more general names ``horse'' and ``monkey'' are far more common.
To increase recall, we generate up to $N=3$ masks per image by iteratively moving one level up the hypernym chain (e.g., ``sorrel'' $\rightarrow$ ``horse'' $\rightarrow$ ``equine'' $\dots$).
We merge near-duplicate masks with pairwise IoU $\geq 0.9$, yielding a small set of $n_i \leq N$ candidate masks per image $i$.
We select the best mask per image (according to \Cref{eq:filtering-score}) in a later filtering step, described below.

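The hypernym-based prompt construction can be sketched as follows; the toy hypernym table stands in for the WordNet hierarchy, and \code{build\_prompts} is an illustrative helper, not the actual pipeline code:

```python
# Toy hypernym chains standing in for the WordNet hierarchy (illustrative).
HYPERNYMS = {
    "sorrel": "horse",
    "horse": "equine",
    "equine": "odd-toed ungulate",
}

def build_prompts(class_name, hypernyms=HYPERNYMS, n=3):
    """Build up to n detection prompts for one class by walking one
    hypernym level per prompt (hypothetical helper, not pipeline code)."""
    prompts, current = [], class_name
    for _ in range(n):
        current = hypernyms.get(current)
        if current is None:
            break  # reached the top of our (toy) chain
        prompts.append(f"a {class_name}, a type of {current}")
    return prompts
```

Each prompt keeps the class name but generalizes the category, so rare class names still anchor the detector to a common object noun.
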
\textbf{Create neutral backgrounds via object removal.}
Given a candidate mask, we remove the masked region and inpaint it using an object-removal model (LaMa~\cite{Suvorov2022} or Attentive Eraser~\cite{Sun2025}).
This produces a visually plausible, ``neutral'' candidate background that can be paired with many foregrounds.
For each image $i$, we now have $n_i$ foreground objects, cut out of $i$ along the candidate masks, each paired with a background in which the same mask region has been inpainted.

\textbf{Select a high-quality pair.}
Different masks trade off capturing the full object against leaking class cues into the background.
We therefore score each candidate pair using an ensemble $E$ of six pretrained classifiers (ViT/ResNet/Swin) trained on the original dataset.
Intuitively, we prefer (\textit{i}) foregrounds that strongly support the ground-truth class and (\textit{ii}) backgrounds that do \emph{not} support the ground-truth class, while (\textit{iii}) discouraging overly large foreground regions.
For each model $m \in E$, we compute the probability of the ground-truth class $c$, $\P[m(\mathrm{fg})=c]$, on the foreground (rendered on a solid-gray background) and $\P[m(\mathrm{bg})=c]$ on the background, and combine them with a size prior ($\operatorname{size}(\cdot)$ denotes the pixel count):

\begin{align} \begin{split} \label{eq:filtering-score}
\text{score}(\mathrm{fg}, \mathrm{bg}, c) &= \log \left( \sum_{m \in E} \frac{\P[m(\mathrm{fg}) = c]}{\abs{E}} \right)
+ \log \left( 1 - \sum_{m \in E} \frac{\P[m(\mathrm{bg}) = c]}{\abs{E}} \right) \\
& + \lambda \log \left( 1 - \abs{\frac{\operatorname{size}(\mathrm{fg})}{\operatorname{size}(\mathrm{bg})} - \eps} \right).
\end{split} \end{align}

We set $\lambda = 2$ and $\eps = 0.1$ via a small hyperparameter search on a manually annotated subset of foreground/background variants.
For each image, we keep the candidate mask with the highest score.

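A direct transcription of \Cref{eq:filtering-score}, assuming the per-model class probabilities have already been computed; \code{filtering\_score} is an illustrative helper, not the pipeline's code:

```python
import math

def filtering_score(p_fg, p_bg, size_fg, size_bg, lam=2.0, eps=0.1):
    """Score one (fg, bg) candidate pair: mean ensemble probability of the
    true class on the foreground, its complement on the background, and a
    size prior. p_fg / p_bg are lists of per-model probabilities
    P[m(x) = c] over the ensemble E."""
    mean_fg = sum(p_fg) / len(p_fg)
    mean_bg = sum(p_bg) / len(p_bg)
    size_prior = 1 - abs(size_fg / size_bg - eps)
    return (math.log(mean_fg)
            + math.log(1 - mean_bg)
            + lam * math.log(size_prior))
```

The score is maximal (zero) when the ensemble is certain on the foreground, assigns zero probability on the background, and the foreground covers exactly an $\eps$ fraction of the background.
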
\textbf{Filter low-quality backgrounds.}
Finally, we discard backgrounds that are heavily ($\geq 80\%$) inpainted, as they tend to look synthetic and provide little useful diversity (see supplementary).
This step filters out $10\%$ of backgrounds.

Although segmentation is the main computational overhead, it is performed once offline and reused across all training runs.
On NVIDIA H100 GPUs, the segmentation stage processes $5338.6$ images per GPU-hour when inpainting with LaMa.
For ImageNet, this amounts to just under $30$ hours on a single node.
At roughly twice the cost of a single ViT-B training run ($\approx 14$ hours), this is a modest, one-time investment that is amortized over every subsequent experiment that uses the dataset.
For details, see the supplementary material.

The output of the segmentation stage is a collection of foreground cut-outs (with transparency) and a pool of diverse, neutral backgrounds, which we use in the online recombination stage.
For ImageNet, we provide pre-computed segmentation output\footnote{\code{URL will go here}}.

\subsection{Recombination}
\label{sec:recombination}
In each epoch, the recombination stage generates a recombined training sample for each foreground by (\textit{i}) choosing a background, (\textit{ii}) choosing a target foreground size, (\textit{iii}) sampling a placement, and (\textit{iv}) pasting the foreground using its alpha mask.
This exposes the model to controlled changes in context and spatial layout that are largely absent from standard augmentation.

\textbf{Background sampling.}
For each foreground object, we draw a background using one of three increasingly challenging strategies:
(\textit{i}) \textit{Original}: use the object's own inpainted background (no context shift);
(\textit{ii}) \textit{Same-class}: sample a background from the pool of backgrounds belonging to the same class (slight but plausible context shift);
(\textit{iii}) \textit{All-classes}: sample from the pool of all inpainted backgrounds (large context shift).
These strategies trade off context diversity against semantic plausibility.
We ensure that each foreground is used exactly once per epoch; backgrounds may repeat.

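The three strategies can be sketched as follows; the dictionary keyed by class label and the \code{fg} record are illustrative data structures, not the pipeline's actual ones:

```python
import random

def sample_background(fg, backgrounds, strategy="same-class"):
    """Draw a background for one foreground under one of three strategies.
    `fg` carries the object's class `label` and its own inpainted
    `background`; `backgrounds` maps class label -> list of inpainted
    backgrounds (both illustrative data structures)."""
    if strategy == "original":
        # (i) the object's own inpainted background: no context shift
        return fg["background"]
    if strategy == "same-class":
        # (ii) a background from the same class: slight, plausible shift
        return random.choice(backgrounds[fg["label"]])
    if strategy == "all-classes":
        # (iii) any background in the pool: large context shift
        pool = [bg for bgs in backgrounds.values() for bg in bgs]
        return random.choice(pool)
    raise ValueError(f"unknown strategy: {strategy}")
```
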
\textbf{Foreground scaling.}
Let $r_{\text{fg}}$ denote the relative foreground area in the foreground's source image, and $r_{\text{bg}}$ the relative area that the \emph{original} foreground (before inpainting) occupied in the background's source image.
We compute lower/upper size limits $(s_l, s_u)$ from these two ratios using one of two variants:
(\textit{i}) \emph{mean} sets $(s_l, s_u)$ using the mean of $r_{\text{fg}}$ and $r_{\text{bg}}$, while
(\textit{ii}) \emph{range} uses their min/max to preserve a wider scale range.
We then sample the final scale from a $\pm 30\%$ interval around these limits and resize the foreground accordingly, preserving its aspect ratio.

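One plausible reading of the two scaling variants, sketched as a sampler over relative foreground areas; \code{sample\_scale} and its exact jitter placement are illustrative assumptions:

```python
import random

def sample_scale(r_fg, r_bg, variant="range", jitter=0.3):
    """Sample a target relative foreground area from limits (s_l, s_u)
    derived from the two source ratios; an illustrative reading of the
    mean/range variants, not the pipeline's exact code."""
    if variant == "mean":
        s_l = s_u = (r_fg + r_bg) / 2  # both limits collapse to the mean
    elif variant == "range":
        s_l, s_u = min(r_fg, r_bg), max(r_fg, r_bg)  # wider scale range
    else:
        raise ValueError(f"unknown variant: {variant}")
    s = random.uniform(s_l, s_u)
    return s * random.uniform(1 - jitter, 1 + jitter)  # +/- 30% interval
```
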
\textbf{Placement and boundary smoothing.}
We paste the resized foreground at a uniformly random location within the background.
To reduce cut-and-paste artifacts, we slightly soften the alpha-mask boundary by applying a Gaussian blur with $\sigma \in [\frac{\sigma_{\text{max}}}{10}, \sigma_{\text{max}}]$, following the range used in modern augmentation~\cite{Touvron2022}.
% For example recombined images see \Cref{tab:foraug-examples}.

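The random placement and alpha compositing reduce to a few array operations; this sketch assumes the alpha mask has already been blurred at the boundary, and \code{paste\_foreground} is a hypothetical helper:

```python
import numpy as np

def paste_foreground(fg_rgb, alpha, bg_rgb, rng=None):
    """Alpha-composite a resized foreground onto a background at a
    uniformly random location. fg_rgb: (h, w, 3); alpha: (h, w) in [0, 1]
    (optionally Gaussian-blurred at the boundary beforehand); bg_rgb:
    (H, W, 3) with H >= h and W >= w."""
    rng = rng or np.random.default_rng()
    h, w = alpha.shape
    H, W = bg_rgb.shape[:2]
    y = int(rng.integers(0, H - h + 1))  # uniformly random placement
    x = int(rng.integers(0, W - w + 1))
    out = bg_rgb.astype(np.float32).copy()
    a = alpha[..., None]  # broadcast the mask over the color channels
    out[y:y + h, x:x + w] = a * fg_rgb + (1 - a) * out[y:y + h, x:x + w]
    return out.astype(np.uint8)
```
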
% \textbf{Interaction with standard augmentation.}
% We support two augmentation orders:
% (\textit{i}) apply the full augmentation pipeline after recombination; or
% (\textit{ii}) apply crop+resize to the background first (to keep the full foreground visible), then recombine, then apply the remaining augmentations.
% The former matches standard training exactly; the latter isolates composition changes from random cropping.

\textbf{Mixing with original images.}
We optionally mix recombined samples with unmodified dataset images.
A mixing ratio $p$ gives the probability of drawing the original image; otherwise, we take its foreground and apply \schemename.
We consider constant $p$ as well as linear/cosine schedules that increase $p$ over training.
Finally, we apply standard data augmentation techniques to the resulting images.

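The mixing schedules can be sketched as follows; the default $p$ and the exact schedule shapes are illustrative assumptions, not the paper's tuned values:

```python
import math

def mix_probability(step, total_steps, p_end=0.5, schedule="constant"):
    """Probability p of using the original image at a given training step;
    otherwise the sample is recombined. p_end and the schedule shapes are
    illustrative, not the paper's tuned values."""
    t = step / total_steps  # training progress in [0, 1]
    if schedule == "constant":
        return p_end
    if schedule == "linear":
        return p_end * t  # increases linearly from 0 to p_end
    if schedule == "cosine":
        return p_end * (1 - math.cos(math.pi * t)) / 2  # slow start and end
    raise ValueError(f"unknown schedule: {schedule}")
```

Increasing schedules show the model more recombined (harder) samples early and gradually return to the original data distribution late in training.
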
The online recombination is CPU-parallel and barely increases training time: we measure a $\approx 1\%$ increase in average step time (see supplementary).