Files
ForAug/sec/appendix.tex
Tobias Christian Nauen ff34712155 AAAI Version
2026-02-24 12:22:44 +01:00

184 lines
14 KiB
TeX

% !TeX root = ../supplementary.tex
\section{Extended Bates Distribution}
\begin{figure}[h!]
\centering
\includegraphics[width=.7\columnwidth]{img/bates.pdf}
\caption{Plot of the probability distribution function (PDF) of the extended Bates distribution for different parameters $\eta$. Higher values of $\eta$ concentrate the distribution around the center.}
\label{fig:bates-pdf}
\end{figure}
% Finally, we analyze the foreground object's positioning in the image.
% We utilize an extended Bates distribution to sample the position of the foreground object.
% The Bates distribution~\cite{Bates1955} with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
% Therefore, the larger $\eta$, the more concentrated the distribution is around the center.
% We extend this concept to $\eta \leq -1$ by shifting the distribution away from the center and towards the edges.
% We extend this concept to $\eta \leq -1$ by defining
% \begin{align*}
% X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta)
% \end{align*}
% for $\eta \leq 1$ with $s$ being the sawtooth function on $[0, 1]$:
% \begin{align}
% s(x) = \begin{cases}
% x + 0.5 & \text{if } 0 < x < 0.5 \\
% x - 0.5 & \text{if } 0.5 \leq x \leq 1
% \end{cases}
% \end{align}
% Note that $s \circ s = \id$ on $[0, 1]$.
% This way, distributions with $\eta \leq -1$ are more concentrated around the borders.
% $\eta = 1$ and $\eta = -1$ both correspond to the uniform distribution.
% The PDF of this extended Bates distribution is visualized in \Cref{fig:bates-pdf}.
We introduce an extension of the Bates distribution~\cite{Bates1955} to include negative parameters, enabling sampling of foreground object positions away from the image center.
The standard Bates distribution, for $\eta \in \N$, is defined as the mean of $\eta$ independent random variables drawn from a uniform distribution \cite{Jonhson1995}.
A larger $\eta$ value increases the concentration of samples around the distribution's mean, which in this case is the image center.
To achieve the opposite effect---concentrating samples at the image borders---we extend the distribution to $\eta \leq -1$ via
\begin{align*}
X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta)
\end{align*}
This is accomplished by sampling from a standard Bates distribution with parameter $-\eta \geq 1$ and then applying a sawtooth function.
The sawtooth function on the interval $[0,1]$ is defined as
\begin{align}
s(x) = \begin{cases}
x + 0.5 & \text{if } 0 < x < 0.5 \\
x - 0.5 & \text{if } 0.5 \leq x \leq 1
\end{cases}
\end{align}
This function effectively maps the central portion of the interval to the edges and the edge portions to the center.
For example, a value of 0.3 (central-left) is mapped to 0.8 (edge-right), while 0.8 (edge-right) is mapped to 0.3 (central-left).
This transformation inverts the distribution's concentration, shifting the probability mass from the center to the borders.
We visualize the probability density function (PDF) of the extended Bates distribution in \Cref{fig:bates-pdf}.
Both $\eta = 1$ and $\eta = -1$ result in a uniform distribution across the image.
\section{Resource Usage of \schemename}
To utilize the proposed \schemename, specific computational resources are necessary, particularly in terms of storage for the output of the segmentation stage and on-the-fly processing of the recombination stage.
The output of \schemename/\name's segmentation step on the ImageNet dataset requires 73 GB of additional disk space, on top of the 147 GB base size of ImageNet.
The recombination step of \schemename is implemented as a data loader operation.
It is thus offloaded to the CPU, where it can be heavily parallelized and therefore results in only a very minor increase in the training step-time.
For example, using a ViT-B model on an NVIDIA A100 GPU, the average update step-time increased by $1\%$, from $528 \pm 2$ ms to $534 \pm 1$ ms.
\onecolumn
\section{Training Setup}
\label{sec:training_setup}
\begin{table*}[h!]
\centering
\begin{tabular}{lcc}
\toprule
Parameter & ViT, Swin, ResNet & DeiT \\
\midrule
Image Resolution & $224 \times 224$ & $224 \times 224$ \\
Epochs & 300 & 300 \\
Learning Rate & 3e-3 & S/B: 1e-3, L: 5e-4 \\
Learning Rate Schedule & cosine decay & cosine decay \\
Batch Size & 2048 & 1024 \\
GPUs & $4\times$ NVIDIA A100/H100/H200 & $4\times$ NVIDIA A100/H100/H200 \\
Warmup Schedule & linear & linear \\
Warmup Epochs & 3 & 3 \\
Weight Decay & 0.02 & 0.05 \\
Label Smoothing & 0.1 & 0.1 \\
Optimizer & Lamb \cite{You2020} & AdamW \\
Data Augmentation Policy & 3-Augment \cite{Touvron2022} & DeiT \cite{Touvron2021b} \\
\bottomrule
\end{tabular}
\caption{Training setup and hyperparameters for our ImageNet and \name training.}
\label{tab:in-setup}
\end{table*}
\begin{table}[h!]
\centering
\begin{tabular}{lcccc}
\toprule
Dataset & Batch Size & Epochs & Learning Rate & Num. GPUs \\
\midrule
Aircraft & 512 & 500 & 3e-4 & 2 \\
Cars & 1024 & 500 & 3e-4 & 4 \\
Flowers & 256 & 500 & 3e-4 & 1 \\
Food & 2048 & 100 & 3e-4 & 4 \\
Pets & 512 & 500 & 3e-4 & 2 \\
\bottomrule
\end{tabular}
\caption{Training setup for finetuning on different downstream datasets. Other settings are the same as in \Cref{tab:in-setup}. For finetuning, we always utilize 3-Augment and the related parameters from the \emph{ViT, Swin, ResNet} column of \Cref{tab:in-setup}.}
\label{tab:downstream-setup}
\end{table}
On ImageNet we use the same training setup as \cite{Nauen2025} and \cite{Touvron2022} without pretraining for ViT, Swin, and ResNet.
For DeiT, we train the same ViT architecture but using the data augmentation scheme and hyperparameters from \cite{Touvron2021b}.
As our focus is on evaluating the changes in accuracy due to \schemename/\name, like \cite{Nauen2025}, we stick to one set of hyperparameters for all models.
We list the settings used for training on ImageNet and \name in \Cref{tab:in-setup} and the ones used for finetuning those weights on the downstream datasets in \Cref{tab:downstream-setup}.
Our implementation uses PyTorch \cite{Paszke2019} and the \emph{timm} library \cite{Wightman2019} for model architectures and basic functions.
\begin{table*}[h!]
\centering
\begin{tabular}{ll}
\toprule
Parameter & Value \\
\midrule
GPU & NVIDIA A100/H100/H200 \\
CPU & 24 CPU cores (Intel Xeon) per GPU \\
Memory & up to 120GB per GPU \\
Operating System & Enroot container for SLURM based on Ubuntu 24.04 LTS \\
Python & 3.12.3 \\
PyTorch & 2.7.0 \\
TorchVision & 0.22.0 \\
Timm & 1.0.15 \\
\bottomrule
\end{tabular}
\caption{Hardware and Software specifics used for both training and evaluation.}
\label{tab:hw-sw-versions}
\end{table*}
\Cref{tab:hw-sw-versions} lists the specific hardware we use, as well as versions of the relevant software packages.
\newpage
\section{Infill Model Comparison}
\begin{table*}[h!]
\centering
\resizebox{.95\textwidth}{!}{
\begin{tabular}{cc@{\hskip 0.3in}cc}
\toprule
LaMa & Att. Eraser & LaMa & Att. Eraser \\
\midrule
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00000090.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00000090.JPEG} &
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00000890.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00000890.JPEG} \\
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00002106.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00002106.JPEG} &
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00005045.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00005045.JPEG} \\
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00007437.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00007437.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00008542.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00008542.JPEG} \\
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00009674.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00009674.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00002743.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00002743.JPEG} \\
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00003097.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00003097.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00011629.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00011629.JPEG} \\
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00000547.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00000547.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00025256.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00025256.JPEG} \\
\bottomrule
\end{tabular}
}
\caption{Example infills of LaMa and Attentive Eraser.}
\label{tab:infill-examples}
\end{table*}
We visualize example infilled images for both LaMa \cite{Suvorov2021} and Attentive Eraser \cite{Sun2024} in \Cref{tab:infill-examples}.
We qualitatively find that while LaMa often leaves repeated textures or blurry spots where the object was erased, Attentive Eraser produces slightly cleaner and more coherent infills of the background.
\newpage
\section{Images with High Infill Ratio}
\begin{table*}[h!]
\centering
\begin{tabular}{ccc}
\toprule
Infill Ratio & LaMa & Att. Eraser \\
\midrule
93.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} \\ \\
95.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} \\ \\
83.7 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} \\ \\
88.2 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG}}
\end{tabular}
\caption{Example infills with a large relative foreground area size that is infilled (infill ratio).}
\label{tbl:high-rat}
\end{table*}
\Cref{tbl:high-rat} shows infills for images where Grounded SAM \cite{Ren2024} marks a high percentage of the image as the foreground object (infill ratio), which has to be erased by the infill models.
While LaMa tends to fill those spots with mostly black or gray and textures similar to what we saw in \Cref{tab:infill-examples}, Attentive Eraser tends to create novel patterns by copying what is left of the background all over the rest of the image.
% We filter out such mostly infilled background using our background pruning hyperparameter $t_\text{prune} = 0.8$.
We filter out all backgrounds that have an infill ratio larger than our pruning threshold $t_\text{prune} = 0.8$.