2 Commits

Author SHA1 Message Date
Tobias Christian Nauen
e15c9057d3 packages are in same dir 2026-02-24 11:49:43 +01:00
Tobias Christian Nauen
7333b469ce experiment-notes version 2026-02-24 11:48:19 +01:00
57 changed files with 3285 additions and 5888 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,21 +0,0 @@
# CVPR/ICCV/3DV Official LaTeX template
**Note:** the Microsoft Word version of the template is in the branch [`main-msword`](https://github.com/cvpr-org/author-kit/tree/main-msword).
### History (in reverse chronological order)
- updated for CVPR 2026 [Vladimir Pavlovic](mailto:vladimir@rutgers.edu)
- added styles for `subsubsection` and fixed the wrong PDF bookmarks by [Di Fang](https://github.com/fang-d)
- modernized for CVPR 2025 by [Christian Richardt](https://richardt.name/)
- fixed page centering for CVPR 2025 by [Stefan Roth](mailto:stefan.roth@NOSPAMtu-darmstadt.de)
- inline enumerations and `cvprblue` links for CVPR 2025 by [Ioannis Gkioulekas](https://www.cs.cmu.edu/~igkioule/)
- added automated LaTeX build testing for CVPR 2025 by [Ahan Shabanov](https://ahanio.github.io)
- references in `cvprblue` for CVPR 2024 by [Klaus Greff](https://github.com/Qwlouse)
- added natbib for CVPR 2024 by [Christian Richardt](https://richardt.name/)
- replaced buggy (review-mode) line numbering for 3DV 2024 by [Adín Ramírez Rivera](https://openreview.net/profile?id=~Ad%C3%ADn_Ram%C3%ADrez_Rivera1)
- logic for inline supplementary for 3DV 2024 by [Andrea Tagliasacchi](https://taiya.github.io)
- modernized for CVPR 2022 by [Stefan Roth](mailto:stefan.roth@NOSPAMtu-darmstadt.de)
- created cvpr.sty file to unify review/rebuttal/final versions by [Ming-Ming Cheng](https://github.com/MCG-NKU/CVPR_Template)
- developed CVPR 2005 template by [Paolo Ienne](mailto:Paolo.Ienne@di.epfl.ch) and [Andrew Fitzgibbon](mailto:awf@acm.org)

79
algorithm.sty Normal file
View File

@@ -0,0 +1,79 @@
% ALGORITHM STYLE -- Released 8 April 1996
% for LaTeX-2e
% Copyright -- 1994 Peter Williams
% E-mail Peter.Williams@dsto.defence.gov.au
% Provides a floating "algorithm" environment built on the float package.
% Package options select the float style (plain/ruled/boxed) and the
% sectioning level within which algorithms are numbered; any option not
% recognized below becomes the float's printed name (default "Algorithm").
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithm}
\typeout{Document Style `algorithm' - floating environment}
\RequirePackage{float}
\RequirePackage{ifthen}
% \ALG@within (macro) holds the parent counter name; the boolean of the
% same name records whether any numbering option was given at all.
\newcommand{\ALG@within}{nothing}
\newboolean{ALG@within}
\setboolean{ALG@within}{false}
\newcommand{\ALG@floatstyle}{ruled}
\newcommand{\ALG@name}{Algorithm}
\newcommand{\listalgorithmname}{List of \ALG@name s}
% Declare Options
% first appearance
\DeclareOption{plain}{
\renewcommand{\ALG@floatstyle}{plain}
}
\DeclareOption{ruled}{
\renewcommand{\ALG@floatstyle}{ruled}
}
\DeclareOption{boxed}{
\renewcommand{\ALG@floatstyle}{boxed}
}
% then numbering convention
\DeclareOption{part}{
\renewcommand{\ALG@within}{part}
\setboolean{ALG@within}{true}
}
\DeclareOption{chapter}{
\renewcommand{\ALG@within}{chapter}
\setboolean{ALG@within}{true}
}
\DeclareOption{section}{
\renewcommand{\ALG@within}{section}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsection}{
\renewcommand{\ALG@within}{subsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsubsection}{
\renewcommand{\ALG@within}{subsubsection}
\setboolean{ALG@within}{true}
}
% "nothing" explicitly requests plain sequential numbering.
\DeclareOption{nothing}{
\renewcommand{\ALG@within}{nothing}
\setboolean{ALG@within}{true}
}
% Fallback: any other option string is used verbatim as the float name
% (\edef expands \CurrentOption immediately so the name is frozen here).
\DeclareOption*{\edef\ALG@name{\CurrentOption}}
% ALGORITHM
%
\ProcessOptions
\floatstyle{\ALG@floatstyle}
% Create the float via float.sty, numbered within the requested counter
% (if any); "loa" is the auxiliary-file extension for the list of algorithms.
\ifthenelse{\boolean{ALG@within}}{
\ifthenelse{\equal{\ALG@within}{part}}
{\newfloat{algorithm}{htbp}{loa}[part]}{}
\ifthenelse{\equal{\ALG@within}{chapter}}
{\newfloat{algorithm}{htbp}{loa}[chapter]}{}
\ifthenelse{\equal{\ALG@within}{section}}
{\newfloat{algorithm}{htbp}{loa}[section]}{}
\ifthenelse{\equal{\ALG@within}{subsection}}
{\newfloat{algorithm}{htbp}{loa}[subsection]}{}
\ifthenelse{\equal{\ALG@within}{subsubsection}}
{\newfloat{algorithm}{htbp}{loa}[subsubsection]}{}
\ifthenelse{\equal{\ALG@within}{nothing}}
{\newfloat{algorithm}{htbp}{loa}}{}
}{
\newfloat{algorithm}{htbp}{loa}
}
\floatname{algorithm}{\ALG@name}
% \listofalgorithms behaves like \listoffigures, reading the .loa file.
\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}}

201
algorithmic.sty Normal file
View File

@@ -0,0 +1,201 @@
% ALGORITHMIC STYLE -- Released 8 APRIL 1996
% for LaTeX version 2e
% Copyright -- 1994 Peter Williams
% E-mail PeterWilliams@dsto.defence.gov.au
%
% Modified by Alex Smola (08/2000)
% E-mail Alex.Smola@anu.edu.au
%
% Provides the (non-floating) "algorithmic" environment: a numbered list of
% pseudo-code statements with IF/FOR/WHILE/... keywords. The only package
% option, "noend", suppresses the "end if"/"end for"/... closing lines.
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithmic}
\typeout{Document Style `algorithmic' - environment}
%
\RequirePackage{ifthen}
\RequirePackage{calc}
\newboolean{ALC@noend}
\setboolean{ALC@noend}{false}
% ALC@line: absolute statement number; ALC@rem: counts statements since the
% last printed number (used when only every n-th line is numbered).
\newcounter{ALC@line}
\newcounter{ALC@rem}
% Accumulated left margin of the current nesting level.
\newlength{\ALC@tlm}
%
\DeclareOption{noend}{\setboolean{ALC@noend}{true}}
%
\ProcessOptions
%
% ALGORITHMIC
% Keyword text lives in these macros so documents can \renewcommand them
% (e.g. for translation) without touching the environment logic.
\newcommand{\algorithmicrequire}{\textbf{Require:}}
\newcommand{\algorithmicensure}{\textbf{Ensure:}}
\newcommand{\algorithmiccomment}[1]{\{#1\}}
\newcommand{\algorithmicend}{\textbf{end}}
\newcommand{\algorithmicif}{\textbf{if}}
\newcommand{\algorithmicthen}{\textbf{then}}
\newcommand{\algorithmicelse}{\textbf{else}}
\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif}
\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif}
\newcommand{\algorithmicfor}{\textbf{for}}
\newcommand{\algorithmicforall}{\textbf{for all}}
\newcommand{\algorithmicdo}{\textbf{do}}
\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor}
\newcommand{\algorithmicwhile}{\textbf{while}}
\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile}
\newcommand{\algorithmicloop}{\textbf{loop}}
\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop}
\newcommand{\algorithmicrepeat}{\textbf{repeat}}
\newcommand{\algorithmicuntil}{\textbf{until}}
%changed by alex smola
\newcommand{\algorithmicinput}{\textbf{input}}
\newcommand{\algorithmicoutput}{\textbf{output}}
\newcommand{\algorithmicset}{\textbf{set}}
\newcommand{\algorithmictrue}{\textbf{true}}
\newcommand{\algorithmicfalse}{\textbf{false}}
\newcommand{\algorithmicand}{\textbf{and\ }}
\newcommand{\algorithmicor}{\textbf{or\ }}
\newcommand{\algorithmicfunction}{\textbf{function}}
\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction}
\newcommand{\algorithmicmain}{\textbf{main}}
\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain}
%end changed by alex smola
% \ALC@item is a modified copy of the LaTeX kernel's \@item: it places the
% label box while accounting for the extra \ALC@tlm indentation of nested
% blocks. Do not edit casually -- the statement order mirrors the kernel.
\def\ALC@item[#1]{%
\if@noparitem \@donoparitem
\else \if@inlabel \indent \par \fi
\ifhmode \unskip\unskip \par \fi
\if@newlist \if@nobreak \@nbitem \else
\addpenalty\@beginparpenalty
\addvspace\@topsep \addvspace{-\parskip}\fi
\else \addpenalty\@itempenalty \addvspace\itemsep
\fi
\global\@inlabeltrue
\fi
\everypar{\global\@minipagefalse\global\@newlistfalse
\if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels
\penalty\z@ \fi
\everypar{}}\global\@nobreakfalse
\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi
\sbox\@tempboxa{\makelabel{#1}}%
\global\setbox\@labels
\hbox{\unhbox\@labels \hskip \itemindent
\hskip -\labelwidth \hskip -\ALC@tlm
\ifdim \wd\@tempboxa >\labelwidth
\box\@tempboxa
\else \hbox to\labelwidth {\unhbox\@tempboxa}\fi
\hskip \ALC@tlm}\ignorespaces}
%
% Optional argument = numbering interval: 0 numbers nothing visibly wider
% than the default label; n > 0 prints a number on every n-th statement.
% (Inner definitions use ## because they are made inside this environment.)
\newenvironment{algorithmic}[1][0]{
\let\@item\ALC@item
% Label generator: prints the line number only when ALC@rem has wrapped to 0.
\newcommand{\ALC@lno}{%
\ifthenelse{\equal{\arabic{ALC@rem}}{0}}
{{\footnotesize \arabic{ALC@line}:}}{}%
}
% All nesting depths reuse the level-one list geometry.
\let\@listii\@listi
\let\@listiii\@listi
\let\@listiv\@listi
\let\@listv\@listi
\let\@listvi\@listi
\let\@listvii\@listi
% ALC@g: generic indented sub-block; every IF/FOR/WHILE/... body is one.
\newenvironment{ALC@g}{
\begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@
\listparindent\z@ \rightmargin\z@
\topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@
\leftmargin 1em
\addtolength{\ALC@tlm}{\leftmargin}
}
}
{\end{list}}
% ALC@it: advance both counters, wrap ALC@rem at the interval #1, emit \item.
\newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item}
% ALC@com: append an inline comment unless the optional arg kept its
% sentinel value "default".
\newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}%
{}{\ \algorithmiccomment{##1}}}
\newcommand{\REQUIRE}{\item[\algorithmicrequire]}
\newcommand{\ENSURE}{\item[\algorithmicensure]}
\newcommand{\STATE}{\ALC@it}
\newcommand{\COMMENT}[1]{\algorithmiccomment{##1}}
%changes by alex smola
\newcommand{\INPUT}{\item[\algorithmicinput]}
\newcommand{\OUTPUT}{\item[\algorithmicoutput]}
\newcommand{\SET}{\item[\algorithmicset]}
% \newcommand{\TRUE}{\algorithmictrue}
% \newcommand{\FALSE}{\algorithmicfalse}
\newcommand{\AND}{\algorithmicand}
\newcommand{\OR}{\algorithmicor}
\newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}}
%end changes by alex smola
\newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}}
\renewcommand{\\}{\@centercr}
% Statement constructors: each opens its matching ALC@ block; the optional
% first argument is an inline comment (sentinel "default" = no comment).
\newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\
\algorithmicthen\ {##2}}
\newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\ELSIF}[2][default]%
{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ %
\algorithmicdo\ {##2}}
\newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@whl}}
\newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop%
\ALC@com{##1}\begin{ALC@loop}}
%changed by alex smola
\newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ %
\ALC@com{##1}\begin{ALC@func}}
\newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ %
\ALC@com{##1}\begin{ALC@main}}
%end changed by alex smola
\newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat%
\ALC@com{##1}\begin{ALC@rpt}}
\newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1}
% With "noend" the END* commands only close the block; otherwise they also
% print the corresponding "end ..." keyword line.
\ifthenelse{\boolean{ALC@noend}}{
\newcommand{\ENDIF}{\end{ALC@if}}
\newcommand{\ENDFOR}{\end{ALC@for}}
\newcommand{\ENDWHILE}{\end{ALC@whl}}
\newcommand{\ENDLOOP}{\end{ALC@loop}}
\newcommand{\ENDFUNCTION}{\end{ALC@func}}
\newcommand{\ENDMAIN}{\end{ALC@main}}
}{
\newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif}
\newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor}
\newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile}
\newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop}
\newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction}
\newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain}
}
% Disable the kernel's "too deeply nested" error for deep algorithm nesting.
\renewcommand{\@toodeep}{}
% Outermost list: resets counters; label width depends on whether line
% numbers are printed (interval 0 -> narrow labels).
\begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}%
\itemsep\z@ \itemindent\z@ \listparindent\z@%
\partopsep\z@ \parskip\z@ \parsep\z@%
\labelsep 0.5em \topsep 0.2em%
\ifthenelse{\equal{#1}{0}}
{\labelwidth 0.5em }
{\labelwidth 1.2em }
\leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep}
\ALC@tlm\labelsep
}
}
{\end{list}}

508
cvpr.sty
View File

@@ -1,508 +0,0 @@
% ---------------------------------------------------------------
%
% No guarantee is given that the format corresponds perfectly to
% IEEE 8.5" x 11" Proceedings, but most features should be ok.
%
% ---------------------------------------------------------------
% with LaTeX2e:
% =============
%
% use as
% \documentclass[times,10pt,twocolumn]{article}
% \usepackage[options]{cvpr}
% \usepackage{times}
%
% "options" should be replaced by
% * "review" for submitting a paper for review,
% * "final" for the camera ready, and
% * "rebuttal" for the author rebuttal.
%
% specify references as
% {\small
% \bibliographystyle{ieee}
% \bibliography{...your files...}
% }
% ---------------------------------------------------------------
% Package identification, required packages, and option processing for the
% CVPR style. Modes: (default) camera-ready, "review" (line numbers, page
% numbers, confidentiality banner), "rebuttal" (review layout, no author
% warning), plus "pagenumbers" to force page numbers in the camera-ready.
\NeedsTeXFormat{LaTeX2e}[1999/12/01]
\ProvidesPackage{cvpr}[2026 LaTeX class for IEEE CVPR]
\RequirePackage{times} % Integrate Times for here
\RequirePackage{xspace}
\RequirePackage[dvipsnames]{xcolor}
\RequirePackage{graphicx}
\RequirePackage{amsmath}
\RequirePackage{amssymb}
\RequirePackage{booktabs}
\RequirePackage[numbers,sort&compress]{natbib}
\setlength{\bibsep}{1pt plus 1pt minus 1pt}
\RequirePackage{silence} % Suppress unwanted warnings
% Raise badness/fuzz thresholds so routine over/underfull boxes stay quiet.
\hbadness=10000 \vbadness=10000 \vfuzz=30pt \hfuzz=30pt
\WarningFilter{latexfont}{Font shape declaration}
\WarningFilter{latex}{Font shape}
% Named filter, activated only in rebuttal mode (rebuttals have no \author).
\WarningFilter[rebuttal]{latex}{No \author given}
\RequirePackage{etoolbox}
% Use modern caption package to allow for sub-figures etc.
% Reproduces the original CVPR/ICCV style as closely as possible.
\RequirePackage[format=plain,labelformat=simple,labelsep=period,font=small,compatibility=false]{caption}
\RequirePackage[font=footnotesize,skip=3pt,subrefformat=parens]{subcaption}
% Mode toggles; camera-ready ("final") is the default state.
\newtoggle{cvprfinal} % Camera-ready version
\newtoggle{cvprrebuttal} % Rebuttal
\newtoggle{cvprpagenumbers} % Force page numbers (in camera ready)
\toggletrue{cvprfinal}
\togglefalse{cvprrebuttal}
\togglefalse{cvprpagenumbers}
\DeclareOption{review}{\togglefalse{cvprfinal}\toggletrue{cvprpagenumbers}}
\DeclareOption{rebuttal}{\togglefalse{cvprfinal}\toggletrue{cvprrebuttal}}
\DeclareOption{pagenumbers}{\toggletrue{cvprpagenumbers}}
% Fixed typo in the warning text ("Unkown" -> "Unknown").
\DeclareOption*{\PackageWarning{cvpr}{Unknown option `\CurrentOption'}}
\ProcessOptions\relax
% Don't warn about missing author for rebuttal
\iftoggle{cvprrebuttal}{%
\ActivateWarningFilters[rebuttal]
}{}
% Breaking lines for URLs in the bib
\RequirePackage[hyphens]{url}
\Urlmuskip=0mu plus 1mu\relax
% ---------------------------------------------------------------
% Inlined version of the obsolete "everyshi-2001-05-15" package.
% Provides \EveryShipout/\AtNextShipout hooks by wrapping TeX's \shipout
% primitive; used below to stamp the review ruler/banner on every page.
\newcommand{\@EveryShipout@Hook}{}
\newcommand{\@EveryShipout@AtNextHook}{}
% Append #1 to the hook run on every shipout.
\newcommand*{\EveryShipout}[1]
{\g@addto@macro\@EveryShipout@Hook{#1}}
% Append #1 to the hook run on the next shipout only (cleared after use).
\newcommand*{\AtNextShipout}[1]
{\g@addto@macro\@EveryShipout@AtNextHook{#1}}
% Replacement \shipout: capture the page into \box\@cclv, then run hooks.
\newcommand{\@EveryShipout@Shipout}{%
\afterassignment\@EveryShipout@Test
\global\setbox\@cclv= %
}
% Handles both \shipout\box... and \shipout\hbox{...} forms (the latter
% finishes the box assignment inside a group, hence \aftergroup).
\newcommand{\@EveryShipout@Test}{%
\ifvoid\@cclv\relax
\aftergroup\@EveryShipout@Output
\else
\@EveryShipout@Output
\fi%
}
\newcommand{\@EveryShipout@Output}{%
\@EveryShipout@Hook%
\@EveryShipout@AtNextHook%
\gdef\@EveryShipout@AtNextHook{}%
\@EveryShipout@Org@Shipout\box\@cclv%
}
\newcommand{\@EveryShipout@Org@Shipout}{}
% Install the wrapper at \begin{document}, saving the original \shipout.
\newcommand*{\@EveryShipout@Init}{%
\message{ABD: EveryShipout initializing macros}%
\let\@EveryShipout@Org@Shipout\shipout
\let\shipout\@EveryShipout@Shipout
}
\AtBeginDocument{\@EveryShipout@Init}
% ---------------------------------------------------------------
% ---------------------------------------------------------------
% Inlined simplified version of the "eso-pic" package.
% Lets material be \put() at absolute page/text-block positions; used for
% the side ruler and the confidentiality banner in review mode.
\newcommand\LenToUnit[1]{#1\@gobble}
% Anchor at the top-left corner of the paper.
\newcommand\AtPageUpperLeft[1]{%
\begingroup
\@tempdima=0pt\relax\@tempdimb=\ESO@yoffsetI\relax
\put(\LenToUnit{\@tempdima},\LenToUnit{\@tempdimb}){#1}%
\endgroup
}
\newcommand\AtPageLowerLeft[1]{\AtPageUpperLeft{%
\put(0,\LenToUnit{-\paperheight}){#1}}}
\newcommand\AtPageCenter[1]{\AtPageUpperLeft{%
\put(\LenToUnit{.5\paperwidth},\LenToUnit{-.5\paperheight}){#1}}%
}
% Anchor at the top-left corner of the text block (margins accounted for;
% odd/even pages may have different side margins in twoside layouts).
\newcommand\AtTextUpperLeft[1]{%
\begingroup
\setlength\@tempdima{1in}%
\ifodd\c@page%
\advance\@tempdima\oddsidemargin%
\else%
\advance\@tempdima\evensidemargin%
\fi%
\@tempdimb=\ESO@yoffsetI\relax\advance\@tempdimb-1in\relax%
\advance\@tempdimb-\topmargin%
\advance\@tempdimb-\headheight\advance\@tempdimb-\headsep%
\put(\LenToUnit{\@tempdima},\LenToUnit{\@tempdimb}){#1}%
\endgroup
}
\newcommand\AtTextLowerLeft[1]{\AtTextUpperLeft{%
\put(0,\LenToUnit{-\textheight}){#1}}}
\newcommand\AtTextCenter[1]{\AtTextUpperLeft{%
\put(\LenToUnit{.5\textwidth},\LenToUnit{-.5\textheight}){#1}}}
% HookI: persistent picture; HookII: next-page-only (cleared after shipout);
% HookIII: reserved/internal.
\newcommand{\ESO@HookI}{} \newcommand{\ESO@HookII}{}
\newcommand{\ESO@HookIII}{}
\newcommand{\AddToShipoutPicture}{%
\@ifstar{\g@addto@macro\ESO@HookII}{\g@addto@macro\ESO@HookI}}
\newcommand{\ClearShipoutPicture}{\global\let\ESO@HookI\@empty}
\newcommand\ESO@isMEMOIR[1]{}
\@ifclassloaded{memoir}{\renewcommand\ESO@isMEMOIR[1]{#1}}{}
% Overlay the accumulated picture hooks onto the captured page box.
\newcommand{\@ShipoutPicture}{%
\bgroup
\@tempswafalse%
\ifx\ESO@HookI\@empty\else\@tempswatrue\fi%
\ifx\ESO@HookII\@empty\else\@tempswatrue\fi%
\ifx\ESO@HookIII\@empty\else\@tempswatrue\fi%
\if@tempswa%
\@tempdima=1in\@tempdimb=-\@tempdima%
\advance\@tempdimb\ESO@yoffsetI%
\ESO@isMEMOIR{%
\advance\@tempdima\trimedge%
\advance\@tempdima\paperwidth%
\advance\@tempdima-\stockwidth%
\if@twoside\ifodd\c@page\else%
\advance\@tempdima-2\trimedge%
\advance\@tempdima-\paperwidth%
\advance\@tempdima\stockwidth%
\fi\fi%
\advance\@tempdimb\trimtop}%
\unitlength=1pt%
\global\setbox\@cclv\vbox{%
\vbox{\let\protect\relax
\pictur@(0,0)(\strip@pt\@tempdima,\strip@pt\@tempdimb)%
\ESO@HookIII\ESO@HookI\ESO@HookII%
\global\let\ESO@HookII\@empty%
\endpicture}%
\nointerlineskip%
\box\@cclv}%
\fi
\egroup
}
\EveryShipout{\@ShipoutPicture}
\RequirePackage{keyval}
\newif\ifESO@dvips\ESO@dvipsfalse
\newif\ifESO@texcoord\ESO@texcoordfalse
% Driver detection: dvips needs different y-coordinate handling than pdfTeX.
\AtBeginDocument{%
\IfFileExists{color.sty}
{%
\RequirePackage{color}
\let\ESO@color=\color\let\ESO@colorbox=\colorbox
\let\ESO@fcolorbox=\fcolorbox
}{}
\@ifundefined{Gin@driver}{}%
{%
\ifx\Gin@driver\@empty\else%
\filename@parse{\Gin@driver}\def\reserved@a{dvips}%
\ifx\filename@base\reserved@a\ESO@dvipstrue\fi%
\fi
}%
\ifx\pdfoutput\undefined\else
\ifx\pdfoutput\relax\else
\ifcase\pdfoutput\else
\ESO@dvipsfalse%
\fi
\fi
\fi
}
% y-origin: top of page (default) or TeX coordinates if ESO@texcoord is set.
\ifESO@texcoord
\def\ESO@yoffsetI{0pt}\def\ESO@yoffsetII{-\paperheight}
\else
\def\ESO@yoffsetI{\paperheight}\def\ESO@yoffsetII{0pt}
\fi
% ---------------------------------------------------------------
\typeout{CVPR 8.5 x 11-Inch Proceedings Style `cvpr.sty'.}
% ten point helvetica bold required for captions
% eleven point times bold required for second-order headings
% in some sites the name of the fonts may differ,
% change the name here:
% Raw \font primitives select Type 1 fonts by file name (phvb = Helvetica
% bold, ptmb = Times bold); used by section headings and the review ruler.
\font\cvprtenhv = phvb at 8pt % *** IF THIS FAILS, SEE cvpr.sty ***
\font\elvbf = ptmb scaled 1100
\font\tenbf = ptmb scaled 1000
% If the above lines give an error message, try to comment them and
% uncomment these:
%\font\cvprtenhv = phvb7t at 8pt
%\font\elvbf = ptmb7t scaled 1100
%\font\tenbf = ptmb7t scaled 1000
% set dimensions of columns, gap between columns, and paragraph indent
% (fixed IEEE two-column proceedings geometry for US letter paper)
\setlength{\textheight}{8.875in}
\setlength{\textwidth}{6.875in}
\setlength{\columnsep}{0.3125in}
\setlength{\topmargin}{0in}
\setlength{\headheight}{0in}
\setlength{\headsep}{0in}
\setlength{\parindent}{1pc}
\setlength{\oddsidemargin}{-0.1875in}
\setlength{\evensidemargin}{-0.1875in}
% Suppress page numbers when the appropriate option is given
\iftoggle{cvprpagenumbers}{}{%
\pagestyle{empty}
}
% Sanity checks deferred to \begin{document} so the class and all packages
% are known: enforce article class and warn about wrong class options and
% missing/incomplete hyperref setup.
\AtBeginDocument{%
% Print an error if document class other than article is used
\@ifclassloaded{article}{}{%
\PackageError{cvpr}{Package only meant to be used with document class `article'}{Change document class to `article'.}
}
% Print a warning if incorrect options for article are specified
\@ifclasswith{article}{10pt}{}{%
\PackageWarningNoLine{cvpr}{Incorrect font size specified - CVPR requires 10-point fonts. Please load document class `article' with `10pt' option}
}
\@ifclasswith{article}{twocolumn}{}{%
\PackageWarningNoLine{cvpr}{Single column document - CVPR requires papers to have two-column layout. Please load document class `article' with `twocolumn' option}
}
\@ifclasswith{article}{letterpaper}{}{%
\PackageWarningNoLine{cvpr}{Incorrect paper size - CVPR uses paper size `letter'. Please load document class `article' with `letterpaper' option}
}
% Print a warning if hyperref is not loaded and/or if the pagebackref option is missing
\iftoggle{cvprfinal}{%
\@ifpackageloaded{hyperref}{}{%
\PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded, but highly recommended for camera-ready version}
}
}{%
\@ifpackageloaded{hyperref}{
\@ifpackagewith{hyperref}{pagebackref}{}{
\PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded with option `pagebackref', which is strongly recommended for review version}
}
}{%
\PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded, but strongly recommended for review version}
}
}
}
% Title block. In review mode the author line is replaced by
% "Anonymous <conf> submission / Paper ID <id>" (\confName and \paperID are
% expected to be defined by the document); rebuttal mode drops it entirely
% and uses tighter spacing. NOTE(review): the brace after the
% \iftoggle{cvprrebuttal} spacing line below opens the author group that is
% closed after \par -- it is intentional, not a stray brace.
\def\@maketitle{
\newpage
\null
\iftoggle{cvprrebuttal}{\vspace*{-.3in}}{\vskip .375in}
\begin{center}
% smaller title font only for rebuttal
\iftoggle{cvprrebuttal}{{\large \bf \@title \par}}{{\Large \bf \@title \par}}
% additional two empty lines at the end of the title
\iftoggle{cvprrebuttal}{\vspace*{-22pt}}{\vspace*{24pt}}{
\large
\lineskip .5em
\begin{tabular}[t]{c}
\iftoggle{cvprfinal}{
\@author
}{
\iftoggle{cvprrebuttal}{}{
Anonymous \confName~submission\\
\vspace*{1pt}\\
Paper ID \paperID
}
}
\end{tabular}
\par
}
% additional small space at the end of the author name
\vskip .5em
% additional empty line at the end of the title block
\vspace*{12pt}
\end{center}
}
% Abstract is typeset as a centered bold heading followed by italic text
% (plain-style environment, not the article class's abstract environment).
\def\abstract{%
% Suppress page numbers when the appropriate option is given
\iftoggle{cvprpagenumbers}{}{%
\thispagestyle{empty}
}
\centerline{\large\bf Abstract}%
\vspace*{12pt}\noindent%
\it\ignorespaces%
}
\def\endabstract{%
% additional empty line at the end of the abstract
\vspace*{12pt}
}
\def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{}
% correct heading spacing and type
% Sectioning: unnumbered (*) variants skip the "\hskip -1em.~" period hack;
% \texorpdfstring keeps the hack out of hyperref's PDF bookmarks.
\def\cvprsection{\@startsection {section}{1}{\z@}
{-10pt plus -2pt minus -2pt}{7pt} {\large\bf}}
\def\cvprssect#1{\cvprsection*{#1}}
\def\cvprsect#1{\cvprsection{\texorpdfstring{\hskip -1em.~}{}#1}}
\def\section{\@ifstar\cvprssect\cvprsect}
\def\cvprsubsection{\@startsection {subsection}{2}{\z@}
{-8pt plus -2pt minus -2pt}{5pt} {\elvbf}}
\def\cvprssubsect#1{\cvprsubsection*{#1}}
\def\cvprsubsect#1{\cvprsubsection{\texorpdfstring{\hskip -1em.~}{}#1}}
\def\subsection{\@ifstar\cvprssubsect\cvprsubsect}
\def\cvprsubsubsection{\@startsection {subsubsection}{3}{\z@}
{-6pt plus -2pt minus -2pt}{3pt} {\tenbf}}
\def\cvprssubsubsect#1{\cvprsubsubsection*{#1}}
\def\cvprsubsubsect#1{\cvprsubsubsection{\texorpdfstring{\hskip -1em.~}{}#1}}
\def\subsubsection{\@ifstar\cvprssubsubsect\cvprsubsubsect}
%% --------- Page background marks: Ruler and confidentiality (only for review and rebuttal)
\iftoggle{cvprfinal}{
% In review and rebuttal mode, we use the "lineno" package for numbering lines.
% When switching to a different mode, the "\@LN" macro may remain in cached .aux files,
% leading to build errors (https://github.com/cvpr-org/author-kit/issues/49).
% Defining the macro as empty fixes that (https://tex.stackexchange.com/a/125779).
\makeatletter
\providecommand{\@LN}[2]{}
\makeatother
}{
% ----- define vruler
% Registers for the (legacy) vertical ruler. NOTE(review): \cvprruler below
% calls \makevruler, which is not defined anywhere in this file -- confirm
% it is either defined elsewhere or that \cvprruler is dead code.
\makeatletter
\newbox\cvprrulerbox
\newcount\cvprrulercount
\newdimen\cvprruleroffset
\newdimen\cv@lineheight
\newdimen\cv@boxheight
\newbox\cv@tmpbox
\newcount\cv@refno
\newcount\cv@tot
% NUMBER with left flushed zeros \fillzeros[<WIDTH>]<NUMBER>
% (pads #2 with leading zeros to #1 digits; used for 3-digit line numbers;
% do not reformat -- trailing spaces/% inside the body are significant)
\newcount\cv@tmpc@ \newcount\cv@tmpc
\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
\cv@tmpc=1 %
\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
\ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
\ifnum#2<0\advance\cv@tmpc1\relax-\fi
\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
\makeatother
% ----- end of vruler
%% Define linenumber setup
\RequirePackage[switch,mathlines]{lineno}
% Line numbers in CVPR blue using font from \cvprtenhv
\renewcommand\linenumberfont{\cvprtenhv\color[rgb]{.5,.5,1}}
\renewcommand\thelinenumber{\fillzeros[3]{\arabic{linenumber}}}
\setlength{\linenumbersep}{.75cm}
% Bug: An equation with $$ ... $$ isn't numbered, nor is the previous line.
% Patch amsmath commands so that the previous line and the equation itself
% are numbered. Bug: multiline has an extra line number.
% https://tex.stackexchange.com/questions/461186/how-to-use-lineno-with-amsmath-align
\RequirePackage{etoolbox} %% <- for \pretocmd, \apptocmd and \patchcmd
% Wrap both the starred and unstarred forms of environment #1 in
% \linenomath so display math gets line numbers.
\newcommand*\linenomathpatch[1]{%
\expandafter\pretocmd\csname #1\endcsname {\linenomath}{}{}%
\expandafter\pretocmd\csname #1*\endcsname {\linenomath}{}{}%
\expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}%
\expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}%
}
\newcommand*\linenomathpatchAMS[1]{%
\expandafter\pretocmd\csname #1\endcsname {\linenomathAMS}{}{}%
\expandafter\pretocmd\csname #1*\endcsname {\linenomathAMS}{}{}%
\expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}%
\expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}%
}
%% Definition of \linenomathAMS depends on whether the mathlines option is provided
\expandafter\ifx\linenomath\linenomathWithnumbers
\let\linenomathAMS\linenomathWithnumbers
%% The following line gets rid of an extra line numbers at the bottom:
\patchcmd\linenomathAMS{\advance\postdisplaypenalty\linenopenalty}{}{}{}
\else
\let\linenomathAMS\linenomathNonumbers
\fi
% Add the numbers
\linenumbers
% Patch after all packages so amsmath's definitions are final.
\AtBeginDocument{%
\linenomathpatch{equation}%
\linenomathpatchAMS{gather}%
\linenomathpatchAMS{multline}%
\linenomathpatchAMS{align}%
\linenomathpatchAMS{alignat}%
\linenomathpatchAMS{flalign}%
}
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
\def\cvprruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\cvprrulerbox}}
% Stamp paper ID in both top corners and the confidentiality banner above
% the text block on every page (via the inlined eso-pic/everyshi machinery).
\AddToShipoutPicture{%
\color[rgb]{.5,.5,1}
\def\pid{\parbox{1in}{\begin{center}\bf\sf{\small \confName}\\\small \#\paperID\end{center}}}
\AtTextUpperLeft{%paperID in corners
\put(\LenToUnit{-65pt},\LenToUnit{45pt}){\pid}
\put(\LenToUnit{\textwidth-12pt},\LenToUnit{45pt}){\pid}
}
\AtTextUpperLeft{%confidential
\put(0,\LenToUnit{1cm}){\parbox{\textwidth}{\centering\cvprtenhv
\confName~\confYear~Submission \#\paperID. CONFIDENTIAL REVIEW COPY. DO NOT DISTRIBUTE.}}
}
}
} % end of not cvprfinal
%%% Make figure placement a little more predictable.
% We trust the user to move figures if this results
% in ugliness.
% Minimize bad page breaks at figures
% (permissive float parameters: pages may be almost entirely floats)
\renewcommand{\textfraction}{0.01}
\renewcommand{\floatpagefraction}{0.99}
\renewcommand{\topfraction}{0.99}
\renewcommand{\bottomfraction}{0.99}
\renewcommand{\dblfloatpagefraction}{0.99}
\renewcommand{\dbltopfraction}{0.99}
\setcounter{totalnumber}{99}
\setcounter{topnumber}{99}
\setcounter{bottomnumber}{99}
% Add a period to the end of an abbreviation unless there's one
% already, then \xspace.
\makeatletter
\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
\def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}
\def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot}
\def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot}
\def\cf{\emph{cf}\onedot} \def\Cf{\emph{Cf}\onedot}
\def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot}
\def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot}
\def\iid{i.i.d\onedot} \def\wolog{w.l.o.g\onedot}
\def\etal{\emph{et al}\onedot}
\makeatother
% ---------------------------------------------------------------
%% redefine the \title command so that a variable name is saved in \thetitle, and provides the \maketitlesupplementary command
% NOTE(review): because \thetitle is created with \newcommand, calling
% \title a second time would error -- confirm single-\title usage is assumed.
\let\titleold\title
\renewcommand{\title}[1]{\titleold{#1}\newcommand{\thetitle}{#1}}
% Full-width supplementary-material heading reusing the saved \thetitle.
\def\maketitlesupplementary
{
\newpage
\twocolumn[
\centering
\Large
\textbf{\thetitle}\\
\vspace{0.5em}Supplementary Material \\
\vspace{1.0em}
] %< twocolumn
}
% ---------------------------------------------------------------
%% Support for easy cross-referencing (e.g. \cref{sec:intro}
% configured with \AtEndPreamble as it needs to be called after hyperref
\AtEndPreamble{
\usepackage[capitalize]{cleveref}
\crefname{section}{Sec.}{Secs.}
\Crefname{section}{Section}{Sections}
\Crefname{table}{Table}{Tables}
\crefname{table}{Tab.}{Tabs.}
}
% ---------------------------------------------------------------
%% More compact itemize/enumeration (e.g. list contributions)
\RequirePackage[shortlabels,inline]{enumitem}
\setlist[itemize]{noitemsep,leftmargin=*,topsep=0em}
\setlist[enumerate]{noitemsep,leftmargin=*,topsep=0em}

485
fancyhdr.sty Normal file
View File

@@ -0,0 +1,485 @@
% fancyhdr.sty version 3.2
% Fancy headers and footers for LaTeX.
% Piet van Oostrum,
% Dept of Computer and Information Sciences, University of Utrecht,
% Padualaan 14, P.O. Box 80.089, 3508 TB Utrecht, The Netherlands
% Telephone: +31 30 2532180. Email: piet@cs.uu.nl
% ========================================================================
% LICENCE:
% This file may be distributed under the terms of the LaTeX Project Public
% License, as described in lppl.txt in the base LaTeX distribution.
% Either version 1 or, at your option, any later version.
% ========================================================================
% MODIFICATION HISTORY:
% Sep 16, 1994
% version 1.4: Correction for use with \reversemargin
% Sep 29, 1994:
% version 1.5: Added the \iftopfloat, \ifbotfloat and \iffloatpage commands
% Oct 4, 1994:
% version 1.6: Reset single spacing in headers/footers for use with
% setspace.sty or doublespace.sty
% Oct 4, 1994:
% version 1.7: changed \let\@mkboth\markboth to
% \def\@mkboth{\protect\markboth} to make it more robust
% Dec 5, 1994:
% version 1.8: corrections for amsbook/amsart: define \@chapapp and (more
% importantly) use the \chapter/sectionmark definitions from ps@headings if
% they exist (which should be true for all standard classes).
% May 31, 1995:
% version 1.9: The proposed \renewcommand{\headrulewidth}{\iffloatpage...
% construction in the doc did not work properly with the fancyplain style.
% June 1, 1995:
% version 1.91: The definition of \@mkboth wasn't restored on subsequent
% \pagestyle{fancy}'s.
% June 1, 1995:
% version 1.92: The sequence \pagestyle{fancyplain} \pagestyle{plain}
% \pagestyle{fancy} would erroneously select the plain version.
% June 1, 1995:
% version 1.93: \fancypagestyle command added.
% Dec 11, 1995:
% version 1.94: suggested by Conrad Hughes <chughes@maths.tcd.ie>
% CJCH, Dec 11, 1995: added \footruleskip to allow control over footrule
% position (old hardcoded value of .3\normalbaselineskip is far too high
% when used with very small footer fonts).
% Jan 31, 1996:
% version 1.95: call \@normalsize in the reset code if that is defined,
% otherwise \normalsize.
% this is to solve a problem with ucthesis.cls, as this doesn't
% define \@currsize. Unfortunately for latex209 calling \normalsize doesn't
% work as this is optimized to do very little, so there \@normalsize should
% be called. Hopefully this code works for all versions of LaTeX known to
% mankind.
% April 25, 1996:
% version 1.96: initialize \headwidth to a magic (negative) value to catch
% most common cases that people change it before calling \pagestyle{fancy}.
% Note it can't be initialized when reading in this file, because
% \textwidth could be changed afterwards. This is quite probable.
% We also switch to \MakeUppercase rather than \uppercase and introduce a
% \nouppercase command for use in headers and footers.
% May 3, 1996:
% version 1.97: Two changes:
% 1. Undo the change in version 1.8 (using the pagestyle{headings} defaults
% for the chapter and section marks. The current version of amsbook and
% amsart classes don't seem to need them anymore. Moreover the standard
% latex classes don't use \markboth if twoside isn't selected, and this is
% confusing as \leftmark doesn't work as expected.
% 2. include a call to \ps@empty in ps@@fancy. This is to solve a problem
% in the amsbook and amsart classes, that make global changes to \topskip,
% which are reset in \ps@empty. Hopefully this doesn't break other things.
% May 7, 1996:
% version 1.98:
% Added % after the line \def\nouppercase
% May 7, 1996:
% version 1.99: This is the alpha version of fancyhdr 2.0
% Introduced the new commands \fancyhead, \fancyfoot, and \fancyhf.
% Changed \headrulewidth, \footrulewidth, \footruleskip to
% macros rather than length parameters, In this way they can be
% conditionalized and they don't consume length registers. There is no need
% to have them as length registers unless you want to do calculations with
% them, which is unlikely. Note that this may make some uses of them
% incompatible (i.e. if you have a file that uses \setlength or \xxxx=)
% May 10, 1996:
% version 1.99a:
% Added a few more % signs
% May 10, 1996:
% version 1.99b:
% Changed the syntax of \f@nfor to be resistant to catcode changes of :=
% Removed the [1] from the defs of \lhead etc. because the parameter is
% consumed by the \@[xy]lhead etc. macros.
% June 24, 1997:
% version 1.99c:
% corrected \nouppercase to also include the protected form of \MakeUppercase
% \global added to manipulation of \headwidth.
% \iffootnote command added.
% Some comments added about \@fancyhead and \@fancyfoot.
% Aug 24, 1998
% version 1.99d
% Changed the default \ps@empty to \ps@@empty in order to allow
% \fancypagestyle{empty} redefinition.
% Oct 11, 2000
% version 2.0
% Added LPPL license clause.
%
% A check for \headheight is added. An errormessage is given (once) if the
% header is too large. Empty headers don't generate the error even if
% \headheight is very small or even 0pt.
% Warning added for the use of 'E' option when twoside option is not used.
% In this case the 'E' fields will never be used.
%
% Mar 10, 2002
% version 2.1beta
% New command: \fancyhfoffset[place]{length}
% defines offsets to be applied to the header/footer to let it stick into
% the margins (if length > 0).
% place is like in fancyhead, except that only E,O,L,R can be used.
% This replaces the old calculation based on \headwidth and the marginpar
% area.
% \headwidth will be dynamically calculated in the headers/footers when
% this is used.
%
% Mar 26, 2002
% version 2.1beta2
% \fancyhfoffset now also takes h,f as possible letters in the argument to
% allow the header and footer widths to be different.
% New commands \fancyheadoffset and \fancyfootoffset added comparable to
% \fancyhead and \fancyfoot.
% Errormessages and warnings have been made more informative.
%
% Dec 9, 2002
% version 2.1
% The defaults for \footrulewidth, \plainheadrulewidth and
% \plainfootrulewidth are changed from \z@skip to 0pt. In this way when
% someone inadvertently uses \setlength to change any of these, the value
% of \z@skip will not be changed, rather an errormessage will be given.
% March 3, 2004
% Release of version 3.0
% Oct 7, 2004
% version 3.1
% Added '\endlinechar=13' to \fancy@reset to prevent problems with
% includegraphics in header when verbatiminput is active.
% March 22, 2005
% version 3.2
% reset \everypar (the real one) in \fancy@reset because spanish.ldf does
% strange things with \everypar between << and >>.
% \ifancy@mpty{text}: open an \ifx test that is true when the argument
% is empty; the caller supplies the \else/\fi (see \fancy@def).
\def\ifancy@mpty#1{\def\temp@a{#1}\ifx\temp@a\@empty}
% \fancy@def\cs{text}: define \cs to the field text (with a \strut for
% uniform height), or to \leavevmode when the text is empty.  The
% definition uses \fancy@gbl, which is \global here but is set to
% \relax inside \fancypagestyle so definitions stay local.
\def\fancy@def#1#2{\ifancy@mpty{#2}\fancy@gbl\def#1{\leavevmode}\else
\fancy@gbl\def#1{#2\strut}\fi}
\let\fancy@gbl\global
% Error/warning reporting: use LaTeX's \PackageError/\PackageWarning
% when available, fall back to the TeX primitive \errmessage otherwise.
\def\@fancyerrmsg#1{%
    \ifx\PackageError\undefined
    \errmessage{#1}\else
    \PackageError{Fancyhdr}{#1}{}\fi}
\def\@fancywarning#1{%
    \ifx\PackageWarning\undefined
    \errmessage{#1}\else
    \PackageWarning{Fancyhdr}{#1}{}\fi}
% Usage: \@forc \var{charstring}{command to be executed for each char}
% This is similar to LaTeX's \@tfor, but expands the charstring.
\def\@forc#1#2#3{\expandafter\f@rc\expandafter#1\expandafter{#2}{#3}}
% \f@rc/\f@@rc: recursive worker pair — peel one character off the
% string, bind it to the loop variable, run the body, recurse.
\def\f@rc#1#2#3{\def\temp@ty{#2}\ifx\@empty\temp@ty\else
    \f@@rc#1#2\f@@rc{#3}\fi}
\def\f@@rc#1#2#3\f@@rc#4{\def#1{#2}#4\f@rc#1{#3}{#4}}
% Usage: \f@nfor\name:=list\do{body}
% Like LaTeX's \@for but an empty list is treated as a list with an empty
% element
\newcommand{\f@nfor}[3]{\edef\@fortmp{#2}%
    \expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}}
% Usage: \def@ult \cs{defaults}{argument}
% sets \cs to the characters from defaults appearing in argument
% or defaults if it would be empty. All characters are lowercased.
% (Used to turn a field selector like "OC" into "oc", falling back to
% the full default set when the selector names none of them.)
\newcommand\def@ult[3]{%
    \edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a
    \def#1{}%
    \@forc\tmpf@ra{#2}%
      {\expandafter\if@in\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}%
    \ifx\@empty#1\def#1{#2}\fi}
%
% \if@in <char><set><truecase><falsecase>
% Delimited-argument trick: expands <truecase> if <char> occurs in
% <set>, else <falsecase>.
%
\newcommand{\if@in}[4]{%
    \edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}%
    \expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi}
% Public field-setting commands.  Each optionally takes a place
% selector, e.g. \fancyhead[LO,RE]{text}; with no selector all places
% are set.  They all dispatch to the common worker \f@ncyhf with an
% 'h' (header), 'f' (footer), or empty (both) tag.
\newcommand{\fancyhead}{\@ifnextchar[{\f@ncyhf\fancyhead h}%
    {\f@ncyhf\fancyhead h[]}}
\newcommand{\fancyfoot}{\@ifnextchar[{\f@ncyhf\fancyfoot f}%
    {\f@ncyhf\fancyfoot f[]}}
\newcommand{\fancyhf}{\@ifnextchar[{\f@ncyhf\fancyhf{}}%
    {\f@ncyhf\fancyhf{}[]}}
% New commands for offsets added
% Same pattern for \fancy*offset, dispatching to \f@ncyhfoffs.
\newcommand{\fancyheadoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyheadoffset h}%
    {\f@ncyhfoffs\fancyheadoffset h[]}}
\newcommand{\fancyfootoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyfootoffset f}%
    {\f@ncyhfoffs\fancyfootoffset f[]}}
\newcommand{\fancyhfoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyhfoffset{}}%
    {\f@ncyhfoffs\fancyhfoffset{}[]}}
% The header and footer fields are stored in command sequences with
% names of the form: \f@ncy<x><y><z> with <x> for [eo], <y> from [lcr]
% and <z> from [hf].
% \f@ncyhf\cmd<hf-tag>[places]{text}: validate the selector characters,
% then for every even/odd x left/center/right x header/footer
% combination it names, store the text in \f@ncy<eo><lcr><hf> via
% \fancy@def.  \cmd is only used for error/warning messages.
\def\f@ncyhf#1#2[#3]#4{%
    \def\temp@c{}%
    \@forc\tmpf@ra{#3}%
      {\expandafter\if@in\tmpf@ra{eolcrhf,EOLCRHF}%
        {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
    \ifx\@empty\temp@c\else
      \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
        [#3]}%
    \fi
    \f@nfor\temp@c{#3}%
      {\def@ult\f@@@eo{eo}\temp@c
        \if@twoside\else
          \if\f@@@eo e\@fancywarning
            {\string#1's `E' option without twoside option is useless}\fi\fi
        \def@ult\f@@@lcr{lcr}\temp@c
        \def@ult\f@@@hf{hf}{#2\temp@c}%
        \@forc\f@@eo\f@@@eo
          {\@forc\f@@lcr\f@@@lcr
            {\@forc\f@@hf\f@@@hf
              {\expandafter\fancy@def\csname
                f@ncy\f@@eo\f@@lcr\f@@hf\endcsname
                {#4}}}}}}
% \f@ncyhfoffs: analogous worker for the offset commands; note the
% selector excludes 'c' (offsets apply to left/right only) and the
% values go into the \f@ncyO@... length registers, after which
% \fancy@setoffs switches the layout code to offset mode.
\def\f@ncyhfoffs#1#2[#3]#4{%
    \def\temp@c{}%
    \@forc\tmpf@ra{#3}%
      {\expandafter\if@in\tmpf@ra{eolrhf,EOLRHF}%
        {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
    \ifx\@empty\temp@c\else
      \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
        [#3]}%
    \fi
    \f@nfor\temp@c{#3}%
      {\def@ult\f@@@eo{eo}\temp@c
        \if@twoside\else
          \if\f@@@eo e\@fancywarning
            {\string#1's `E' option without twoside option is useless}\fi\fi
        \def@ult\f@@@lcr{lr}\temp@c
        \def@ult\f@@@hf{hf}{#2\temp@c}%
        \@forc\f@@eo\f@@@eo
          {\@forc\f@@lcr\f@@@lcr
            {\@forc\f@@hf\f@@@hf
              {\expandafter\setlength\csname
                f@ncyO@\f@@eo\f@@lcr\f@@hf\endcsname
                {#4}}}}}%
    \fancy@setoffs}
% Fancyheadings version 1 commands. These are more or less deprecated,
% but they continue to work.
% Each takes an optional even-page argument: \lhead[even-text]{odd-text};
% with no optional argument the same text is used on both page kinds.
\newcommand{\lhead}{\@ifnextchar[{\@xlhead}{\@ylhead}}
\def\@xlhead[#1]#2{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#2}}
\def\@ylhead#1{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#1}}
\newcommand{\chead}{\@ifnextchar[{\@xchead}{\@ychead}}
\def\@xchead[#1]#2{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#2}}
\def\@ychead#1{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#1}}
\newcommand{\rhead}{\@ifnextchar[{\@xrhead}{\@yrhead}}
\def\@xrhead[#1]#2{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#2}}
\def\@yrhead#1{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#1}}
\newcommand{\lfoot}{\@ifnextchar[{\@xlfoot}{\@ylfoot}}
\def\@xlfoot[#1]#2{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#2}}
\def\@ylfoot#1{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#1}}
\newcommand{\cfoot}{\@ifnextchar[{\@xcfoot}{\@ycfoot}}
\def\@xcfoot[#1]#2{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#2}}
\def\@ycfoot#1{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#1}}
\newcommand{\rfoot}{\@ifnextchar[{\@xrfoot}{\@yrfoot}}
\def\@xrfoot[#1]#2{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#2}}
\def\@yrfoot#1{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#1}}
\newlength{\fancy@headwidth}
\let\headwidth\fancy@headwidth
% Per-field offset registers used by \fancyhfoffset:
% even/odd x left/right x header/footer.
\newlength{\f@ncyO@elh}
\newlength{\f@ncyO@erh}
\newlength{\f@ncyO@olh}
\newlength{\f@ncyO@orh}
\newlength{\f@ncyO@elf}
\newlength{\f@ncyO@erf}
\newlength{\f@ncyO@olf}
\newlength{\f@ncyO@orf}
% Rule widths/skip are macros, not lengths, so they can contain
% conditionals (e.g. \iffloatpage) and cost no length registers.
\newcommand{\headrulewidth}{0.4pt}
\newcommand{\footrulewidth}{0pt}
\newcommand{\footruleskip}{.3\normalbaselineskip}
% Fancyplain stuff shouldn't be used anymore (rather
% \fancypagestyle{plain} should be used), but it must be present for
% compatibility reasons.
\newcommand{\plainheadrulewidth}{0pt}
\newcommand{\plainfootrulewidth}{0pt}
\newif\if@fancyplain \@fancyplainfalse
\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi}
% Sentinel value: \ps@fancy checks for a negative \headwidth to detect
% that the user has not initialized it (see comment there).
\headwidth=-123456789sp %magic constant
% Command to reset various things in the headers:
% a.o. single spacing (taken from setspace.sty)
% and the catcode of ^^M (so that epsf files in the header work if a
% verbatim crosses a page boundary)
% It also defines a \nouppercase command that disables \uppercase and
% \Makeuppercase. It can only be used in the headers and footers.
\let\fnch@everypar\everypar% save real \everypar because of spanish.ldf
\def\fancy@reset{\fnch@everypar{}\restorecr\endlinechar=13
    \def\baselinestretch{1}%
    \def\nouppercase##1{{\let\uppercase\relax\let\MakeUppercase\relax
        \expandafter\let\csname MakeUppercase \endcsname\relax##1}}%
    % Re-select the normal font size; ucthesis.cls / LaTeX 2.09 need
    % the \@normalsize fallback because \normalsize is a no-op there.
    \ifx\undefined\@newbaseline% NFSS not present; 2.09 or 2e
      \ifx\@normalsize\undefined \normalsize % for ucthesis.cls
      \else \@normalsize \fi
    \else% NFSS (2.09) present
      \@newbaseline%
    \fi}
% Initialization of the head and foot text.
% The default values still contain \fancyplain for compatibility.
\fancyhf{} % clear all
% lefthead empty on ``plain'' pages, \rightmark on even, \leftmark on odd pages
% evenhead empty on ``plain'' pages, \leftmark on even, \rightmark on odd pages
\if@twoside
\fancyhead[el,or]{\fancyplain{}{\sl\rightmark}}
\fancyhead[er,ol]{\fancyplain{}{\sl\leftmark}}
\else
\fancyhead[l]{\fancyplain{}{\sl\rightmark}}
\fancyhead[r]{\fancyplain{}{\sl\leftmark}}
\fi
\fancyfoot[c]{\rm\thepage} % page number
% Use box 0 as a temp box and dimen 0 as temp dimen.
% This can be done, because this code will always
% be used inside another box, and therefore the changes are local.
% \@fancyvbox\length{content}: box the content; if it is taller than
% \length, warn once and globally enlarge \length to fit for the rest
% of the run (the \@gobble at the end of the warning eats the
% would-be repeat so the message appears only once).
\def\@fancyvbox#1#2{\setbox0\vbox{#2}\ifdim\ht0>#1\@fancywarning
  {\string#1 is too small (\the#1): ^^J Make it at least \the\ht0.^^J
    We now make it that large for the rest of the document.^^J
    This may cause the page layout to be inconsistent, however\@gobble}%
  \dimen0=#1\global\setlength{#1}{\ht0}\ht0=\dimen0\fi
\box0}
% Put together a header or footer given the left, center and
% right text, fillers at left and right and a rule.
% The \lap commands put the text into an hbox of zero size,
% so overlapping text does not generate an errormessage.
% These macros have 5 parameters:
% 1. LEFTSIDE BEARING % This determines at which side the header will stick
%    out. When \fancyhfoffset is used this calculates \headwidth, otherwise
%    it is \hss or \relax (after expansion).
% 2. \f@ncyolh, \f@ncyelh, \f@ncyolf or \f@ncyelf. This is the left component.
% 3. \f@ncyoch, \f@ncyech, \f@ncyocf or \f@ncyecf. This is the middle comp.
% 4. \f@ncyorh, \f@ncyerh, \f@ncyorf or \f@ncyerf. This is the right component.
% 5. RIGHTSIDE BEARING. This is always \relax or \hss (after expansion).
\def\@fancyhead#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
    \@fancyvbox\headheight{\hbox
      {\rlap{\parbox[b]{\headwidth}{\raggedright#2}}\hfill
        \parbox[b]{\headwidth}{\centering#3}\hfill
        \llap{\parbox[b]{\headwidth}{\raggedleft#4}}}\headrule}}#5}
% Footer variant: the rule sits above the text, and the box height is
% checked against \footskip instead of \headheight.
\def\@fancyfoot#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
    \@fancyvbox\footskip{\footrule
      \hbox{\rlap{\parbox[t]{\headwidth}{\raggedright#2}}\hfill
        \parbox[t]{\headwidth}{\centering#3}\hfill
        \llap{\parbox[t]{\headwidth}{\raggedleft#4}}}}}#5}
% Rules: on \fancyplain pages the plain widths are substituted; the
% negative \vskip keeps the rule from adding to the box height.
\def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi
    \hrule\@height\headrulewidth\@width\headwidth \vskip-\headrulewidth}}
\def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi
    \vskip-\footruleskip\vskip-\footrulewidth
    \hrule\@width\headwidth\@height\footrulewidth\vskip\footruleskip}}
% \pagestyle{fancy}: set up mark-generating sectioning hooks
% (chapter/section for book-like classes, section/subsection
% otherwise), then delegate to \ps@@fancy.  After the first call it
% redefines itself to skip the one-time setup.
\def\ps@fancy{%
\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%for amsbook
%
% Define \MakeUppercase for old LaTeXen.
% Note: we used \def rather than \let, so that \let\uppercase\relax (from
% the version 1 documentation) will still work.
%
\@ifundefined{MakeUppercase}{\def\MakeUppercase{\uppercase}}{}%
\@ifundefined{chapter}{\def\sectionmark##1{\markboth
{\MakeUppercase{\ifnum \c@secnumdepth>\z@
 \thesection\hskip 1em\relax \fi ##1}}{}}%
\def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne
 \thesubsection\hskip 1em\relax \fi ##1}}}%
{\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum \c@secnumdepth>\m@ne
 \@chapapp\ \thechapter. \ \fi ##1}}{}}%
\def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@
 \thesection. \ \fi ##1}}}}%
%\csname ps@headings\endcsname % use \ps@headings defaults if they exist
\ps@@fancy
\gdef\ps@fancy{\@fancyplainfalse\ps@@fancy}%
% Initialize \headwidth if the user didn't
%
\ifdim\headwidth<0sp
%
% This catches the case that \headwidth hasn't been initialized and the
% case that the user added something to \headwidth in the expectation that
% it was initialized to \textwidth. We compensate this now. This loses if
% the user intended to multiply it by a factor. But that case is more
% likely done by saying something like \headwidth=1.2\textwidth.
% The doc says you have to change \headwidth after the first call to
% \pagestyle{fancy}. This code is just to catch the most common cases were
% that requirement is violated.
%
\global\advance\headwidth123456789sp\global\advance\headwidth\textwidth
\fi}
% fancyplain: like fancy, but the "plain" pagestyle also routes
% through the fancy machinery with \if@fancyplain set.
\def\ps@fancyplain{\ps@fancy \let\ps@plain\ps@plain@fancy}
\def\ps@plain@fancy{\@fancyplaintrue\ps@@fancy}
% Keep a pristine copy of the empty pagestyle so that
% \fancypagestyle{empty} can still be redefined by the user.
\let\ps@@empty\ps@empty
% Core pagestyle body: installs the four page-building hooks, routing
% each through \@fancyhead/\@fancyfoot with the stored field macros
% and the current bearing macros (\fancy@O...).
\def\ps@@fancy{%
\ps@@empty % This is for amsbook/amsart, which do strange things with \topskip
\def\@mkboth{\protect\markboth}%
\def\@oddhead{\@fancyhead\fancy@Oolh\f@ncyolh\f@ncyoch\f@ncyorh\fancy@Oorh}%
\def\@oddfoot{\@fancyfoot\fancy@Oolf\f@ncyolf\f@ncyocf\f@ncyorf\fancy@Oorf}%
\def\@evenhead{\@fancyhead\fancy@Oelh\f@ncyelh\f@ncyech\f@ncyerh\fancy@Oerh}%
\def\@evenfoot{\@fancyfoot\fancy@Oelf\f@ncyelf\f@ncyecf\f@ncyerf\fancy@Oerf}%
}
% Default definitions for compatibility mode:
% These cause the header/footer to take the defined \headwidth as width
% And to shift in the direction of the marginpar area
\def\fancy@Oolh{\if@reversemargin\hss\else\relax\fi}
\def\fancy@Oorh{\if@reversemargin\relax\else\hss\fi}
\let\fancy@Oelh\fancy@Oorh
\let\fancy@Oerh\fancy@Oolh
\let\fancy@Oolf\fancy@Oolh
\let\fancy@Oorf\fancy@Oorh
\let\fancy@Oelf\fancy@Oelh
\let\fancy@Oerf\fancy@Oerh
% New definitions for the use of \fancyhfoffset
% These calculate the \headwidth from \textwidth and the specified offsets.
\def\fancy@offsolh{\headwidth=\textwidth\advance\headwidth\f@ncyO@olh
    \advance\headwidth\f@ncyO@orh\hskip-\f@ncyO@olh}
\def\fancy@offselh{\headwidth=\textwidth\advance\headwidth\f@ncyO@elh
    \advance\headwidth\f@ncyO@erh\hskip-\f@ncyO@elh}
\def\fancy@offsolf{\headwidth=\textwidth\advance\headwidth\f@ncyO@olf
    \advance\headwidth\f@ncyO@orf\hskip-\f@ncyO@olf}
\def\fancy@offself{\headwidth=\textwidth\advance\headwidth\f@ncyO@elf
    \advance\headwidth\f@ncyO@erf\hskip-\f@ncyO@elf}
% Switch the bearing macros to offset mode (called by \f@ncyhfoffs).
\def\fancy@setoffs{%
% Just in case \let\headwidth\textwidth was used
    \fancy@gbl\let\headwidth\fancy@headwidth
    \fancy@gbl\let\fancy@Oolh\fancy@offsolh
    \fancy@gbl\let\fancy@Oelh\fancy@offselh
    \fancy@gbl\let\fancy@Oorh\hss
    \fancy@gbl\let\fancy@Oerh\hss
    \fancy@gbl\let\fancy@Oolf\fancy@offsolf
    \fancy@gbl\let\fancy@Oelf\fancy@offself
    \fancy@gbl\let\fancy@Oorf\hss
    \fancy@gbl\let\fancy@Oerf\hss}
% Hook into \@makecol so headers/footers can test for footnotes and
% top/bottom floats on the current page via the commands below.
\newif\iffootnote
\let\latex@makecol\@makecol
\def\@makecol{\ifvoid\footins\footnotetrue\else\footnotefalse\fi
\let\topfloat\@toplist\let\botfloat\@botlist\latex@makecol}
\def\iftopfloat#1#2{\ifx\topfloat\empty #2\else #1\fi}
\def\ifbotfloat#1#2{\ifx\botfloat\empty #2\else #1\fi}
\def\iffloatpage#1#2{\if@fcolmade #1\else #2\fi}
% \fancypagestyle{name}{setup}: define pagestyle <name> based on fancy;
% \fancy@gbl is \relax inside so the field definitions stay local.
\newcommand{\fancypagestyle}[2]{%
  \@namedef{ps@#1}{\let\fancy@gbl\relax#2\relax\ps@fancy}}

View File

@@ -1,5 +1,12 @@
%% File: `abbrvnat.bst'
%% A modification of `abbrv.bst' for use with natbib package
%% File: `icml2024.bst'
%% A modification of `plainnl.bst' for use with natbib package
%%
%% Copyright 2010 Hal Daum\'e III
%% Modified by J. Fürnkranz
%% - Changed labels from (X and Y, 2000) to (X & Y, 2000)
%% - Changed References to last name first and abbreviated first names.
%% Modified by Iain Murray 2018 (who suggests adopting a standard .bst in future...)
%% - Made it actually use abbreviated first names
%%
%% Copyright 1993-2007 Patrick W Daly
%% Max-Planck-Institut f\"ur Sonnensystemforschung
@@ -14,7 +21,7 @@
%% version 1 of the License, or any later version.
%%
% Version and source file information:
% \ProvidesFile{natbst.mbs}[2007/11/26 1.93 (PWD)]
% \ProvidesFile{icml2010.mbs}[2007/11/26 1.93 (PWD)]
%
% BibTeX `plainnat' family
% version 0.99b for BibTeX versions 0.99a or later,
@@ -219,8 +226,7 @@ FUNCTION {format.names}
s num.names$ 'numnames :=
numnames 'namesleft :=
{ namesleft #0 > }
% Formerly { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't :=
{ s nameptr "{ff }{vv }{ll}{, jj}" format.name$ 't :=
{ s nameptr "{vv~}{ll}{, jj}{, f.}" format.name$ 't :=
nameptr #1 >
{ namesleft #1 >
{ ", " * t * }
@@ -262,8 +268,8 @@ FUNCTION {format.editors}
{ "" }
{ editor format.names
editor num.names$ #1 >
{ ", editors" * }
{ ", editor" * }
{ " (eds.)" * }
{ " (ed.)" * }
if$
}
if$
@@ -272,32 +278,28 @@ FUNCTION {format.editors}
FUNCTION {format.isbn}
{ isbn empty$
{ "" }
% { new.block "ISBN " isbn * }
{ "" }
{ new.block "ISBN " isbn * }
if$
}
FUNCTION {format.issn}
{ issn empty$
{ "" }
% { new.block "ISSN " issn * }
{ "" }
{ new.block "ISSN " issn * }
if$
}
FUNCTION {format.url}
{ url empty$
{ "" }
% { new.block "URL \url{" url * "}" * }
{ "" }
{ new.block "URL \url{" url * "}" * }
if$
}
FUNCTION {format.doi}
{ doi empty$
{ "" }
% { new.block "\doi{" doi * "}" * }
{ "" }
{ new.block "\doi{" doi * "}" * }
if$
}
@@ -425,13 +427,12 @@ FUNCTION {format.date}
pop$ "" }
'skip$
if$
%% CR: Leave out months.
% month empty$
% 'skip$
% { month
% " " * swap$ *
% }
% if$
month empty$
'skip$
{ month
" " * swap$ *
}
if$
extra.label *
}
@@ -457,24 +458,20 @@ FUNCTION {either.or.check}
FUNCTION {format.bvolume}
{ volume empty$
{ "" }
%% CR: Don't show "volume 1234 of LNCS" etc.
% { "volume" volume tie.or.space.connect
% series empty$
% 'skip$
% { " of " * series emphasize * }
% if$
% "volume and number" number either.or.check
% }
{ "" }
{ "volume" volume tie.or.space.connect
series empty$
'skip$
{ " of " * series emphasize * }
if$
"volume and number" number either.or.check
}
if$
}
FUNCTION {format.number.series}
{ volume empty$
{ number empty$
%% CR: Leave out series information.
% { series field.or.null }
{ "" }
{ series field.or.null }
{ output.state mid.sentence =
{ "number" }
{ "Number" }
@@ -528,8 +525,8 @@ FUNCTION {format.pages}
{ pages empty$
{ "" }
{ pages multi.page.check
{ "pages" pages n.dashify tie.or.space.connect }
{ "page" pages tie.or.space.connect }
{ "pp.\ " pages n.dashify tie.or.space.connect }
{ "pp.\ " pages tie.or.space.connect }
if$
}
if$
@@ -603,13 +600,11 @@ FUNCTION {format.chapter.pages}
FUNCTION {format.in.ed.booktitle}
{ booktitle empty$
{ "" }
%% CR: Leave out editors even if the information is available.
% { editor empty$
% { "In " booktitle emphasize * }
% { "In " format.editors * ", " * booktitle emphasize * }
% if$
% }
{ editor empty$
{ "In " booktitle emphasize * }
{ "In " format.editors * ", " * booktitle emphasize * }
if$
}
if$
}
@@ -1024,13 +1019,13 @@ FUNCTION {unpublished}
FUNCTION {default.type} { misc }
MACRO {jan} {"Jan."}
MACRO {jan} {"January"}
MACRO {feb} {"Feb."}
MACRO {feb} {"February"}
MACRO {mar} {"Mar."}
MACRO {mar} {"March"}
MACRO {apr} {"Apr."}
MACRO {apr} {"April"}
MACRO {may} {"May"}
@@ -1038,58 +1033,58 @@ MACRO {jun} {"June"}
MACRO {jul} {"July"}
MACRO {aug} {"Aug."}
MACRO {aug} {"August"}
MACRO {sep} {"Sept."}
MACRO {sep} {"September"}
MACRO {oct} {"Oct."}
MACRO {oct} {"October"}
MACRO {nov} {"Nov."}
MACRO {nov} {"November"}
MACRO {dec} {"Dec."}
MACRO {dec} {"December"}
MACRO {acmcs} {"ACM Comput. Surv."}
MACRO {acmcs} {"ACM Computing Surveys"}
MACRO {acta} {"Acta Inf."}
MACRO {acta} {"Acta Informatica"}
MACRO {cacm} {"Commun. ACM"}
MACRO {cacm} {"Communications of the ACM"}
MACRO {ibmjrd} {"IBM J. Res. Dev."}
MACRO {ibmjrd} {"IBM Journal of Research and Development"}
MACRO {ibmsj} {"IBM Syst.~J."}
MACRO {ibmsj} {"IBM Systems Journal"}
MACRO {ieeese} {"IEEE Trans. Softw. Eng."}
MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
MACRO {ieeetc} {"IEEE Trans. Comput."}
MACRO {ieeetc} {"IEEE Transactions on Computers"}
MACRO {ieeetcad}
{"IEEE Trans. Comput.-Aided Design Integrated Circuits"}
{"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
MACRO {ipl} {"Inf. Process. Lett."}
MACRO {ipl} {"Information Processing Letters"}
MACRO {jacm} {"J.~ACM"}
MACRO {jacm} {"Journal of the ACM"}
MACRO {jcss} {"J.~Comput. Syst. Sci."}
MACRO {jcss} {"Journal of Computer and System Sciences"}
MACRO {scp} {"Sci. Comput. Programming"}
MACRO {scp} {"Science of Computer Programming"}
MACRO {sicomp} {"SIAM J. Comput."}
MACRO {sicomp} {"SIAM Journal on Computing"}
MACRO {tocs} {"ACM Trans. Comput. Syst."}
MACRO {tocs} {"ACM Transactions on Computer Systems"}
MACRO {tods} {"ACM Trans. Database Syst."}
MACRO {tods} {"ACM Transactions on Database Systems"}
MACRO {tog} {"ACM Trans. Gr."}
MACRO {tog} {"ACM Transactions on Graphics"}
MACRO {toms} {"ACM Trans. Math. Softw."}
MACRO {toms} {"ACM Transactions on Mathematical Software"}
MACRO {toois} {"ACM Trans. Office Inf. Syst."}
MACRO {toois} {"ACM Transactions on Office Information Systems"}
MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."}
MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
MACRO {tcs} {"Theoretical Comput. Sci."}
MACRO {tcs} {"Theoretical Computer Science"}
READ
@@ -1120,7 +1115,7 @@ FUNCTION {format.lab.names}
'skip$
{ s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
{ " et~al." * }
{ " and " * s #2 "{vv~}{ll}" format.name$ * }
{ " \& " * s #2 "{vv~}{ll}" format.name$ * }
if$
}
if$

805
icml2024.sty Normal file
View File

@@ -0,0 +1,805 @@
% File: icml2024.sty (LaTeX style file for ICML-2024, version of 2023-11-23)
% This file contains the LaTeX formatting parameters for a two-column
% conference proceedings that is 8.5 inches wide by 11 inches high.
%
% Modified by Jonathan Scarlett 2024: changed years, volume, location
%
% Modified by Sivan Sabato 2023: changed years and volume number.
% Modified by Jonathan Scarlett 2023: added page numbers to every page
%
% Modified by Csaba Szepesvari 2022: changed years, PMLR ref. Turned off checking marginparwidth
% as marginparwidth only controls the space available for margin notes and margin notes
% will NEVER be used anyways in submitted versions, so there is no reason one should
% check whether marginparwidth has been tampered with.
% Also removed pdfview=FitH from hypersetup as it did not do its job; the default choice is a bit better
% but of course the double-column format is not supported by this hyperlink preview functionality
% in a completely satisfactory fashion.
% Modified by Gang Niu 2022: Changed color to xcolor
%
% Modified by Iain Murray 2018: changed years, location. Remove affiliation notes when anonymous.
% Move times dependency from .tex to .sty so fewer people delete it.
%
% Modified by Daniel Roy 2017: changed byline to use footnotes for affiliations, and removed emails
%
% Modified by Percy Liang 12/2/2013: changed the year, location from the previous template for ICML 2014
% Modified by Fei Sha 9/2/2013: changed the year, location from the previous template for ICML 2013
%
% Modified by Fei Sha 4/24/2013: (1) remove the extra whitespace after the first author's email address (in %the camera-ready version) (2) change the Proceeding ... of ICML 2010 to 2014 so PDF's metadata will show up % correctly
%
% Modified by Sanjoy Dasgupta, 2013: changed years, location
%
% Modified by Francesco Figari, 2012: changed years, location
%
% Modified by Christoph Sawade and Tobias Scheffer, 2011: added line
% numbers, changed years
%
% Modified by Hal Daume III, 2010: changed years, added hyperlinks
%
% Modified by Kiri Wagstaff, 2009: changed years
%
% Modified by Sam Roweis, 2008: changed years
%
% Modified by Ricardo Silva, 2007: update of the ifpdf verification
%
% Modified by Prasad Tadepalli and Andrew Moore, merely changing years.
%
% Modified by Kristian Kersting, 2005, based on Jennifer Dy's 2004 version
% - running title. If the original title is too long or is breaking a line,
% use \icmltitlerunning{...} in the preamble to supply a shorter form.
% Added fancyhdr package to get a running head.
% - Updated to store the page size because pdflatex does compile the
% page size into the pdf.
%
% Hacked by Terran Lane, 2003:
% - Updated to use LaTeX2e style file conventions (ProvidesPackage,
% etc.)
% - Added an ``appearing in'' block at the base of the first column
% (thus keeping the ``appearing in'' note out of the bottom margin
% where the printer should strip in the page numbers).
% - Added a package option [accepted] that selects between the ``Under
% review'' notice (default, when no option is specified) and the
% ``Appearing in'' notice (for use when the paper has been accepted
% and will appear).
%
% Originally created as: ml2k.sty (LaTeX style file for ICML-2000)
% by P. Langley (12/23/99)
%%%%%%%%%%%%%%%%%%%%
%% This version of the style file supports both a ``review'' version
%% and a ``final/accepted'' version. The difference is only in the
%% text that appears in the note at the bottom of the first column of
%% the first page. The default behavior is to print a note to the
%% effect that the paper is under review and don't distribute it. The
%% final/accepted version prints an ``Appearing in'' note. To get the
%% latter behavior, in the calling file change the ``usepackage'' line
%% from:
%% \usepackage{icml2024}
%% to
%% \usepackage[accepted]{icml2024}
%%%%%%%%%%%%%%%%%%%%
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{icml2024}[2023/11/23 v2.0 ICML Conference Style File]
% Before 2018, \usepackage{times} was in the example TeX, but inevitably
% not everybody did it.
\RequirePackage{times}
% Use fancyhdr package
\RequirePackage{fancyhdr}
\RequirePackage{xcolor} % changed from color to xcolor (2021/11/24)
% algorithm/algorithmic: floating algorithm environments used by papers.
\RequirePackage{algorithm}
\RequirePackage{algorithmic}
% natbib: author-year citations (\citep/\citet); see \cite override below.
\RequirePackage{natbib}
\RequirePackage{eso-pic} % used by \AddToShipoutPicture
\RequirePackage{forloop}
\RequirePackage{url}
%%%%%%%% Options
% [accepted]: switch the first-page notice to the "appearing in"
% string and record acceptance in \isaccepted.  (The option body runs
% at \ProcessOptions time, after both strings are defined below.)
\DeclareOption{accepted}{%
    \renewcommand{\Notice@String}{\ICML@appearing}
    \gdef\isaccepted{1}
}
% [nohyperref]: skip the hyperref link-color/metadata setup below.
\DeclareOption{nohyperref}{%
    \gdef\nohyperref{1}
}
%%%%%%%%%%%%%%%%%%%%
% This string is printed at the bottom of the page for the
% final/accepted version of the ``appearing in'' note. Modify it to
% change that text.
%%%%%%%%%%%%%%%%%%%%
\newcommand{\ICML@appearing}{\textit{Proceedings of the
$\mathit{41}^{st}$ International Conference on Machine Learning},
Vienna, Austria. PMLR 235, 2024.
Copyright 2024 by the author(s).}
%%%%%%%%%%%%%%%%%%%%
% This string is printed at the bottom of the page for the draft/under
% review version of the ``appearing in'' note. Modify it to change
% that text.
% (Default; replaced by \ICML@appearing when [accepted] is given.)
%%%%%%%%%%%%%%%%%%%%
\newcommand{\Notice@String}{Preliminary work. Under review by the
International Conference on Machine Learning (ICML)\@. Do not distribute.}
% Cause the declared options to actually be parsed and activated
\ProcessOptions\relax
% Anonymous submissions: if hyperref is loaded and the paper is not
% accepted, scrub the PDF author metadata.
\ifdefined\isaccepted\else\ifdefined\hypersetup
\hypersetup{pdfauthor={Anonymous Authors}}
\fi
\fi
% Unless [nohyperref] was given (and hyperref is loaded), color all
% links dark blue and set default PDF metadata.
\ifdefined\nohyperref\else\ifdefined\hypersetup
\definecolor{mydarkblue}{rgb}{0,0.08,0.45}
\hypersetup{ %
pdftitle={},
pdfsubject={Proceedings of the International Conference on Machine Learning 2024},
pdfkeywords={},
pdfborder=0 0 0,
pdfpagemode=UseNone,
colorlinks=true,
linkcolor=mydarkblue,
citecolor=mydarkblue,
filecolor=mydarkblue,
urlcolor=mydarkblue,
}
\fi
\fi
% Uncomment the following for debugging. It will cause LaTeX to dump
% the version of the ``appearing in'' string that will actually appear
% in the document.
%\typeout{>> Notice string='\Notice@String'}
% Change citation commands to be more like old ICML styles
% (\cite becomes parenthetical \citep; \yrcite gives "(2024)").
\newcommand{\yrcite}[1]{\citeyearpar{#1}}
\renewcommand{\cite}[1]{\citep{#1}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% to ensure the letter format is used. pdflatex does compile the
% page size into the pdf. This is done using \pdfpagewidth and
% \pdfpageheight. As Latex does not know this directives, we first
% check whether pdflatex or latex is used.
%
% Kristian Kersting 2005
%
% in order to account for the more recent use of pdfetex as the default
% compiler, I have changed the pdf verification.
%
% Ricardo Silva 2007
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\paperwidth=8.5in
\paperheight=11in
% old PDFLaTex verification, circa 2005
%
%\newif\ifpdf\ifx\pdfoutput\undefined
% \pdffalse % we are not running PDFLaTeX
%\else
% \pdfoutput=1 % we are running PDFLaTeX
% \pdftrue
%\fi
% \ifpdf is true only when \pdfoutput is defined, not \relax, and
% nonzero (i.e. we are actually producing PDF output).
\newif\ifpdf %adapted from ifpdf.sty
\ifx\pdfoutput\undefined
\else
\ifx\pdfoutput\relax
\else
\ifcase\pdfoutput
\else
\pdftrue
\fi
\fi
\fi
% Under pdflatex, hard-code US letter into the PDF page dimensions.
\ifpdf
% \pdfpagewidth=\paperwidth
% \pdfpageheight=\paperheight
\setlength{\pdfpagewidth}{8.5in}
\setlength{\pdfpageheight}{11in}
\fi
% Physical page layout
\evensidemargin -0.23in
\oddsidemargin -0.23in
\setlength\textheight{9.0in}
\setlength\textwidth{6.75in}
\setlength\columnsep{0.25in}
\setlength\headheight{10pt}
\setlength\headsep{10pt}
\addtolength{\topmargin}{-20pt}
\addtolength{\topmargin}{-0.29in}
% Historically many authors tried to include packages like geometry or fullpage,
% which change the page layout. It either makes the proceedings inconsistent, or
% wastes organizers' time chasing authors. So let's nip these problems in the
% bud here. -- Iain Murray 2018.
%\RequirePackage{printlen}
% At \begin{document}, compare every layout length against the exact
% values set above (in pt, as printlen reports them). If any differs,
% print a bold warning into the document itself so the author cannot
% miss it. The magic constants below must track the settings above.
\AtBeginDocument{%
% To get the numbers below, include printlen package above and see lengths like this:
%\printlength\oddsidemargin\\
%\printlength\headheight\\
%\printlength\textheight\\
%\printlength\marginparsep\\
%\printlength\footskip\\
%\printlength\hoffset\\
%\printlength\paperwidth\\
%\printlength\topmargin\\
%\printlength\headsep\\
%\printlength\textwidth\\
%\printlength\marginparwidth\\
%\printlength\marginparpush\\
%\printlength\voffset\\
%\printlength\paperheight\\
%
\newif\ifmarginsmessedwith
\marginsmessedwithfalse
\ifdim\oddsidemargin=-16.62178pt \else oddsidemargin has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\headheight=10.0pt \else headheight has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\textheight=650.43pt \else textheight has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\marginparsep=11.0pt \else marginparsep has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\footskip=25.0pt \else footskip has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\hoffset=0.0pt \else hoffset has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\paperwidth=614.295pt \else paperwidth has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\topmargin=-24.95781pt \else topmargin has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\headsep=10.0pt \else headsep has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\textwidth=487.8225pt \else textwidth has been altered.\\ \marginsmessedwithtrue\fi
%\ifdim\marginparwidth=65.0pt \else marginparwidth has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\marginparpush=5.0pt \else marginparpush has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\voffset=0.0pt \else voffset has been altered.\\ \marginsmessedwithtrue\fi
\ifdim\paperheight=794.96999pt \else paperheight has been altered.\\ \marginsmessedwithtrue\fi
\ifmarginsmessedwith
\textbf{\large \em The page layout violates the ICML style.}
Please do not change the page layout, or include packages like geometry,
savetrees, or fullpage, which change it for you.
We're not able to reliably undo arbitrary changes to the style. Please remove
the offending package(s), or layout-changing commands and try again.
\fi}
%% The following is adapted from code in the acmconf.sty conference
%% style file. The constants in it are somewhat magical, and appear
%% to work well with the two-column format on US letter paper that
%% ICML uses, but will break if you change that layout, or if you use
%% a longer block of text for the copyright notice string. Fiddle with
%% them if necessary to get the block to fit/look right.
%%
%% -- Terran Lane, 2003
%%
%% The following comments are included verbatim from acmconf.sty:
%%
%%% This section (written by KBT) handles the 1" box in the lower left
%%% corner of the left column of the first page by creating a picture,
%%% and inserting the predefined string at the bottom (with a negative
%%% displacement to offset the space allocated for a non-existent
%%% caption).
%%%
\def\ftype@copyrightbox{8}
\def\@copyrightspace{
% Create a float object positioned at the bottom of the column. Note
% that because of the mystical nature of floats, this has to be called
% before the first column is populated with text (e.g., from the title
% or abstract blocks). Otherwise, the text will force the float to
% the next column. -- TDRL.
\@float{copyrightbox}[b]
\begin{center}
\setlength{\unitlength}{1pc}
\begin{picture}(20,1.5)
% Create a line separating the main text from the note block.
% 4.818pc==0.8in.
\put(0,2.5){\line(1,0){4.818}}
% Insert the text string itself. Note that the string has to be
% enclosed in a parbox -- the \put call needs a box object to
% position. Without the parbox, the text gets splattered across the
% bottom of the page semi-randomly. The 19.75pc distance seems to be
% the width of the column, though I can't find an appropriate distance
% variable to substitute here. -- TDRL.
\put(0,0){\parbox[b]{19.75pc}{\small \Notice@String}}
\end{picture}
\end{center}
\end@float}
% Note: A few Latex versions need the next line instead of the former.
% \addtolength{\topmargin}{0.3in}
% \setlength\footheight{0pt}
\setlength\footskip{25.0pt}
%\pagestyle{empty}
% Two-column mode with flush column bottoms; \sloppy relaxes line
% breaking to cope with the narrow columns.
\flushbottom \twocolumn
\sloppy
% Clear out the addcontentsline command (no table of contents in papers).
\def\addcontentsline#1#2#3{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% commands for formatting paper title, author names, and addresses.
%%start%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%% title as running head -- Kristian Kersting 2005 %%%%%%%%%%%%%
%\makeatletter
%\newtoks\mytoksa
%\newtoks\mytoksb
%\newcommand\addtomylist[2]{%
% \mytoksa\expandafter{#1}%
% \mytoksb{#2}%
% \edef#1{\the\mytoksa\the\mytoksb}%
%}
%\makeatother
% box to check the size of the running head
\newbox\titrun
% general page style: fancyhdr with empty header/footer fields except a
% centered page number; the centered running title is set by \icmltitle.
\pagestyle{fancy}
\fancyhf{}
\fancyhead{}
\fancyfoot{}
\cfoot{\thepage}
% set the width of the head rule to 1 point
\renewcommand{\headrulewidth}{1pt}
% definition to set the head as running head in the preamble
\def\icmltitlerunning#1{\gdef\@icmltitlerunning{#1}}
% main definition adapting \icmltitle from 2004.
% Sets the running head (falling back to the full title), updates the
% PDF title metadata, validates that the running head fits on one line,
% and typesets the title block between two horizontal rules.
\long\def\icmltitle#1{%
%check whether @icmltitlerunning exists
% if not \icmltitle is used as running head
\ifx\undefined\@icmltitlerunning%
\gdef\@icmltitlerunning{#1}
\fi
%add it to pdf information
\ifdefined\nohyperref\else\ifdefined\hypersetup
\hypersetup{pdftitle={#1}}
\fi\fi
%get the dimension of the running title
\global\setbox\titrun=\vbox{\small\bf\@icmltitlerunning}
% error flag
\gdef\@runningtitleerror{0}
% running title too long
\ifdim\wd\titrun>\textwidth%
{\gdef\@runningtitleerror{1}}%
% running title breaks a line (box taller than one line of \small\bf)
\else\ifdim\ht\titrun>6.25pt
{\gdef\@runningtitleerror{2}}%
\fi
\fi
% if there is something wrong with the running title
\ifnum\@runningtitleerror>0
\typeout{}%
\typeout{}%
\typeout{*******************************************************}%
\typeout{Title exceeds size limitations for running head.}%
\typeout{Please supply a shorter form for the running head}
\typeout{with \string\icmltitlerunning{...}\space prior to \string\begin{document}}%
\typeout{*******************************************************}%
\typeout{}%
\typeout{}%
% set default running title
\chead{\small\bf Title Suppressed Due to Excessive Size}%
\else
% 'everything' fine, set provided running title
\chead{\small\bf\@icmltitlerunning}%
\fi
% no running title on the first page of the paper
\thispagestyle{plain}
%%%%%%%%%%%%%%%%%%%% Kristian Kersting %%%%%%%%%%%%%%%%%%%%%%%%%
%end%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\center\baselineskip 18pt
\toptitlebar{\Large\bf #1}\bottomtitlebar}
}
% Accumulator for the complete comma-separated author list; used for
% the PDF author metadata.
\gdef\icmlfullauthorlist{}
\newcommand\addstringtofullauthorlist{\g@addto@macro\icmlfullauthorlist}
% Append one author name; prefixes ", " for every author after the
% first (\icmlanyauthors flags that at least one name was added).
\newcommand\addtofullauthorlist[1]{%
\ifdefined\icmlanyauthors%
\addstringtofullauthorlist{, #1}%
\else%
\addstringtofullauthorlist{#1}%
\gdef\icmlanyauthors{1}%
\fi%
% \ifdefined\nohyperref\else
\ifdefined\hypersetup%
\hypersetup{pdfauthor=\icmlfullauthorlist}%
\fi%\fi
}
% Rules above and below the title block.
\def\toptitlebar{\hrule height1pt \vskip .25in}
\def\bottomtitlebar{\vskip .22in \hrule height1pt \vskip .3in}
% Environment wrapping the centered block of author names.
\newenvironment{icmlauthorlist}{%
\setlength\topsep{0pt}
\setlength\parskip{0pt}
\begin{center}
}{%
\end{center}
}
\newcounter{@affiliationcounter}
% \@pa{label}: print the superscript marker for one affiliation label.
% On first sight of a (non-symbol) label, allocate the next sequential
% affiliation number via a per-label counter @affil<label>. Labels
% registered with \icmlsetsymbol print their symbol instead of a number.
\newcommand{\@pa}[1]{%
% ``#1''
\ifcsname the@affil#1\endcsname
% do nothing
\else
\ifcsname @icmlsymbol#1\endcsname
% nothing
\else
\stepcounter{@affiliationcounter}%
\newcounter{@affil#1}%
\setcounter{@affil#1}{\value{@affiliationcounter}}%
\fi
\fi%
\ifcsname @icmlsymbol#1\endcsname
\textsuperscript{\csname @icmlsymbol#1\endcsname\,}%
\else
%\expandafter\footnotemark[\arabic{@affil#1}\,]%
\textsuperscript{\arabic{@affil#1}\,}%
\fi
}
%\newcommand{\icmlauthor}[2]{%
%\addtofullauthorlist{#1}%
%#1\@for\theaffil:=#2\do{\pa{\theaffil}}%
%}
% \icmlauthor{name}{affil-labels}: in accepted mode, print the bold
% name followed by one superscript marker per comma-separated label.
% In review mode, print ``Anonymous Authors'' exactly once
% (\@icmlfirsttime guards repeats) and ignore the real name.
\newcommand{\icmlauthor}[2]{%
\ifdefined\isaccepted
\mbox{\bf #1}\,\@for\theaffil:=#2\do{\@pa{\theaffil}} \addtofullauthorlist{#1}%
\else
\ifdefined\@icmlfirsttime
\else
\gdef\@icmlfirsttime{1}
\mbox{\bf Anonymous Authors}\@pa{@anon} \addtofullauthorlist{Anonymous Authors}
\fi
\fi
}
% \icmlsetsymbol{label}{symbol}: attach a literal symbol (e.g. *) to a
% label instead of a sequential affiliation number.
\newcommand{\icmlsetsymbol}[2]{%
\expandafter\gdef\csname @icmlsymbol#1\endcsname{#2}
}
% \icmlaffiliation{label}{text}: store the affiliation text under the
% number previously assigned to this label by \icmlauthor/\@pa.
% Must come after all \icmlauthor commands, else the label is unknown
% and an inline AUTHORERR plus console diagnostics are emitted.
% In review mode all affiliations collapse to an anonymous placeholder.
\newcommand{\icmlaffiliation}[2]{%
\ifdefined\isaccepted
\ifcsname the@affil#1\endcsname
\expandafter\gdef\csname @affilname\csname the@affil#1\endcsname\endcsname{#2}%
\else
{\bf AUTHORERR: Error in use of \textbackslash{}icmlaffiliation command. Label ``#1'' not mentioned in some \textbackslash{}icmlauthor\{author name\}\{labels here\} command beforehand. }
\typeout{}%
\typeout{}%
\typeout{*******************************************************}%
\typeout{Affiliation label undefined. }%
\typeout{Make sure \string\icmlaffiliation\space follows }
\typeout{all of \string\icmlauthor\space commands}%
\typeout{*******************************************************}%
\typeout{}%
\typeout{}%
\fi
\else % \isaccepted
% can be called multiple times... it's idempotent
\expandafter\gdef\csname @affilname1\endcsname{Anonymous Institution, Anonymous City, Anonymous Region, Anonymous Country}
\fi
}
% \icmlcorrespondingauthor{name}{email}: accumulate ``name <email>''
% entries (comma-separated on repeat calls); anonymized in review mode.
\newcommand{\icmlcorrespondingauthor}[2]{
\ifdefined\isaccepted
\ifdefined\icmlcorrespondingauthor@text
\g@addto@macro\icmlcorrespondingauthor@text{, #1 \textless{}#2\textgreater{}}
\else
\gdef\icmlcorrespondingauthor@text{#1 \textless{}#2\textgreater{}}
\fi
\else
\gdef\icmlcorrespondingauthor@text{Anonymous Author \textless{}anon.email@domain.com\textgreater{}}
\fi
}
\newcounter{@affilnum}
\newcommand{\printAffiliationsAndNotice}[1]{%
\stepcounter{@affiliationcounter}%
{\let\thefootnote\relax\footnotetext{\hspace*{-\footnotesep}\ifdefined\isaccepted #1\fi%
\forloop{@affilnum}{1}{\value{@affilnum} < \value{@affiliationcounter}}{
\textsuperscript{\arabic{@affilnum}}\ifcsname @affilname\the@affilnum\endcsname%
\csname @affilname\the@affilnum\endcsname%
\else
{\bf AUTHORERR: Missing \textbackslash{}icmlaffiliation.}
\fi
}.
\ifdefined\icmlcorrespondingauthor@text
Correspondence to: \icmlcorrespondingauthor@text.
\else
{\bf AUTHORERR: Missing \textbackslash{}icmlcorrespondingauthor.}
\fi
\ \\
\Notice@String
}
}
}
%\makeatother
% \icmladdress was removed from the style. Emit a bold inline error
% directing authors to \icmlauthor + \icmlaffiliation instead.
% (Fixed: the message previously misspelled \icmlauthor as ``icmlauther''.)
\long\def\icmladdress#1{%
{\bf The \textbackslash{}icmladdress command is no longer used. See the example\_paper PDF .tex for usage of \textbackslash{}icmlauthor and \textbackslash{}icmlaffiliation.}
}
%% keywords as first class citizens
% \icmlkeywords{...}: currently only forwards the keywords to the PDF
% metadata (the commented-out code also printed them in review mode).
\def\icmlkeywords#1{%
% \ifdefined\isaccepted \else
% \par {\bf Keywords:} #1%
% \fi
% \ifdefined\nohyperref\else\ifdefined\hypersetup
% \hypersetup{pdfkeywords={#1}}
% \fi\fi
% \ifdefined\isaccepted \else
% \par {\bf Keywords:} #1%
% \fi
\ifdefined\nohyperref\else\ifdefined\hypersetup
\hypersetup{pdfkeywords={#1}}
\fi\fi
}
% modification to natbib citations: author-year style, round parens,
% semicolon between citations, comma between author and year.
\setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}}
% Redefinition of the abstract environment: centered bold heading
% followed by a quote-indented body.
\renewenvironment{abstract}
{%
% Insert the ``appearing in'' copyright notice.
%\@copyrightspace
\centerline{\large\bf Abstract}
\vspace{-0.12in}\begin{quote}}
{\par\end{quote}\vskip 0.12in}
% numbered section headings with different treatment of numbers.
% Patched LaTeX kernel \@startsection: dispatches to \@sict (below)
% instead of the kernel's \@sect so that section numbers get a
% trailing period.
\def\@startsection#1#2#3#4#5#6{\if@noskipsec \leavevmode \fi
\par \@tempskipa #4\relax
\@afterindenttrue
% Altered the following line to indent a section's first paragraph.
% \ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \@afterindentfalse\fi
\ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \fi
\if@nobreak \everypar{}\else
\addpenalty{\@secpenalty}\addvspace{\@tempskipa}\fi \@ifstar
{\@ssect{#3}{#4}{#5}{#6}}{\@dblarg{\@sict{#1}{#2}{#3}{#4}{#5}{#6}}}}
% \@sict: like the kernel \@sect but typesets the number as
% ``<number>.~'' before the heading text.
\def\@sict#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
\def\@svsec{}\else
\refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname}\fi
\@tempskipa #5\relax
\ifdim \@tempskipa>\z@
\begingroup #6\relax
\@hangfrom{\hskip #3\relax\@svsec.~}{\interlinepenalty \@M #8\par}
\endgroup
\csname #1mark\endcsname{#7}\addcontentsline
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi
#7}\else
\def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
{#7}\addcontentsline
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi
#7}}\fi
\@xsect{#5}}
% \@sect: kernel variant kept for completeness; number followed by a
% 0.4em space instead of a period (used by starred dispatch paths).
\def\@sect#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
\def\@svsec{}\else
\refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname\hskip 0.4em }\fi
\@tempskipa #5\relax
\ifdim \@tempskipa>\z@
\begingroup #6\relax
\@hangfrom{\hskip #3\relax\@svsec}{\interlinepenalty \@M #8\par}
\endgroup
\csname #1mark\endcsname{#7}\addcontentsline
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi
#7}\else
\def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
{#7}\addcontentsline
{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi
#7}}\fi
\@xsect{#5}}
% section headings with less space above and below them
\def\thesection {\arabic{section}}
\def\thesubsection {\thesection.\arabic{subsection}}
\def\section{\@startsection{section}{1}{\z@}{-0.12in}{0.02in}
{\large\bf\raggedright}}
\def\subsection{\@startsection{subsection}{2}{\z@}{-0.10in}{0.01in}
{\normalsize\bf\raggedright}}
\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-0.08in}{0.01in}
{\normalsize\sc\raggedright}}
\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
0.5ex minus .2ex}{-1em}{\normalsize\bf}}
\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus
0.5ex minus .2ex}{-1em}{\normalsize\bf}}
% Footnotes: tightened separation and a short 0.8in rule.
\footnotesep 6.65pt %
\skip\footins 9pt
\def\footnoterule{\kern-3pt \hrule width 0.8in \kern 2.6pt }
\setcounter{footnote}{0}
% Lists and paragraphs: no paragraph indent, 6pt inter-paragraph skip,
% progressively tighter spacing at deeper list levels.
\parindent 0pt
\topsep 4pt plus 1pt minus 2pt
\partopsep 1pt plus 0.5pt minus 0.5pt
\itemsep 2pt plus 1pt minus 0.5pt
\parsep 2pt plus 1pt minus 0.5pt
\parskip 6pt
\leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em
\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em
\leftmarginvi .5em
\labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt
\def\@listi{\leftmargin\leftmargini}
\def\@listii{\leftmargin\leftmarginii
\labelwidth\leftmarginii\advance\labelwidth-\labelsep
\topsep 2pt plus 1pt minus 0.5pt
\parsep 1pt plus 0.5pt minus 0.5pt
\itemsep \parsep}
\def\@listiii{\leftmargin\leftmarginiii
\labelwidth\leftmarginiii\advance\labelwidth-\labelsep
\topsep 1pt plus 0.5pt minus 0.5pt
\parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
\itemsep \topsep}
\def\@listiv{\leftmargin\leftmarginiv
\labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
\def\@listv{\leftmargin\leftmarginv
\labelwidth\leftmarginv\advance\labelwidth-\labelsep}
\def\@listvi{\leftmargin\leftmarginvi
\labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
% Display-math spacing, tightened for the narrow columns.
\abovedisplayskip 7pt plus2pt minus5pt%
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip 0pt plus3pt%
\belowdisplayshortskip 4pt plus3pt minus3pt%
% Less leading in most fonts (due to the narrow columns)
% The choices were between 1-pt and 1.5-pt leading
% (uses the LaTeX 2.09-era \@setsize interface: size name, leading,
% font size command pair).
\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
\def\small{\@setsize\small{10pt}\ixpt\@ixpt}
\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
% Revised formatting for figure captions and table titles.
% Captions wider than the column are set as a paragraph with a slanted
% ``Figure N.'' lead-in; short captions are centered on one line.
\newsavebox\newcaptionbox\newdimen\newcaptionboxwid
\long\def\@makecaption#1#2{
\vskip 10pt
\baselineskip 11pt
\setbox\@tempboxa\hbox{#1. #2}
\ifdim \wd\@tempboxa >\hsize
\sbox{\newcaptionbox}{\small\sl #1.~}
\newcaptionboxwid=\wd\newcaptionbox
\usebox\newcaptionbox {\footnotesize #2}
% \usebox\newcaptionbox {\small #2}
\else
\centerline{{\small\sl #1.} {\small #2}}
\fi}
\def\fnum@figure{Figure \thefigure}
\def\fnum@table{Table \thetable}
% Strut macros for skipping spaces above and below text in tables.
\def\abovestrut#1{\rule[0in]{0in}{#1}\ignorespaces}
\def\belowstrut#1{\rule[-#1]{0in}{#1}\ignorespaces}
\def\abovespace{\abovestrut{0.20in}}
\def\aroundspace{\abovestrut{0.20in}\belowstrut{0.10in}}
\def\belowspace{\belowstrut{0.10in}}
% Various personal itemization commands.
% \texitem{marker}: hanging-indent paragraph with a fixed 12pt marker box.
\def\texitem#1{\par\noindent\hangindent 12pt
\hbox to 12pt {\hss #1 ~}\ignorespaces}
\def\icmlitem{\texitem{$\bullet$}}
% To comment out multiple lines of text.
\long\def\comment#1{}
%% Line counter (not in final version). Adapted from NIPS style file by Christoph Sawade
% Vertical Ruler
% This code is, largely, from the CVPR 2010 conference style file.
% In review mode (no \isaccepted) a grey column of line numbers is
% stamped in the left margin of every page via \AddToShipoutPicture.
% ----- define vruler
\makeatletter
\newbox\icmlrulerbox
\newcount\icmlrulercount
\newdimen\icmlruleroffset
\newdimen\cv@lineheight
\newdimen\cv@boxheight
\newbox\cv@tmpbox
\newcount\cv@refno
\newcount\cv@tot
% NUMBER with left flushed zeros \fillzeros[<WIDTH>]<NUMBER>
\newcount\cv@tmpc@ \newcount\cv@tmpc
\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
\cv@tmpc=1 %
\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
\ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
\ifnum#2<0\advance\cv@tmpc1\relax-\fi
\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
% Builds a \textheight-tall vbox of zero-padded numbers into
% \icmlrulerbox, advancing the global \icmlrulercount as it goes
% (so numbering continues across pages).
\def\makevruler[#1][#2][#3][#4][#5]{
\begingroup\offinterlineskip
\textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt%
\global\setbox\icmlrulerbox=\vbox to \textheight{%
{
\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight
\cv@lineheight=#1\global\icmlrulercount=#2%
\cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2%
\cv@refno1\vskip-\cv@lineheight\vskip1ex%
\loop\setbox\cv@tmpbox=\hbox to0cm{ % side margin
\hfil {\hfil\fillzeros[#4]\icmlrulercount}
}%
\ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break
\advance\cv@refno1\global\advance\icmlrulercount#3\relax
\ifnum\cv@refno<\cv@tot\repeat
}
}
\endgroup
}%
\makeatother
% ----- end of vruler
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
% \icmlruler{start}: 12pt line pitch, step 1, 3-digit zero-padded numbers.
\def\icmlruler#1{\makevruler[12pt][#1][1][3][\textheight]\usebox{\icmlrulerbox}}
% Stamp the ruler on every shipped-out page, review mode only.
\AddToShipoutPicture{%
\icmlruleroffset=\textheight
\advance\icmlruleroffset by 5.2pt % top margin
\color[rgb]{.7,.7,.7}
\ifdefined\isaccepted \else
\AtTextUpperLeft{%
\put(\LenToUnit{-35pt},\LenToUnit{-\icmlruleroffset}){%left ruler
\icmlruler{\icmlrulercount}}
% \put(\LenToUnit{1.04\textwidth},\LenToUnit{-\icmlruleroffset}){%right ruler
% \icmlruler{\icmlrulercount}}
}
\fi
}
\endinput

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.2 KiB

BIN
img/mask_expansion.pdf Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
img/size_bias_wide.pdf Normal file

Binary file not shown.

Binary file not shown.

111
main.bib
View File

@@ -1,111 +0,0 @@
@String(PAMI = {IEEE Trans. Pattern Anal. Mach. Intell.})
@String(IJCV = {Int. J. Comput. Vis.})
@String(CVPR= {IEEE Conf. Comput. Vis. Pattern Recog.})
@String(ICCV= {Int. Conf. Comput. Vis.})
@String(ECCV= {Eur. Conf. Comput. Vis.})
@String(NIPS= {Adv. Neural Inform. Process. Syst.})
@String(ICPR = {Int. Conf. Pattern Recog.})
@String(BMVC= {Brit. Mach. Vis. Conf.})
@String(TOG= {ACM Trans. Graph.})
@String(TIP = {IEEE Trans. Image Process.})
@String(TVCG = {IEEE Trans. Vis. Comput. Graph.})
@String(TMM = {IEEE Trans. Multimedia})
@String(ACMMM= {ACM Int. Conf. Multimedia})
@String(ICME = {Int. Conf. Multimedia and Expo})
@String(ICASSP= {ICASSP})
@String(ICIP = {IEEE Int. Conf. Image Process.})
@String(ACCV = {ACCV})
@String(ICLR = {Int. Conf. Learn. Represent.})
@String(IJCAI = {IJCAI})
@String(PR = {Pattern Recognition})
@String(AAAI = {AAAI})
@String(CVPRW= {IEEE Conf. Comput. Vis. Pattern Recog. Worksh.})
@String(CSVT = {IEEE Trans. Circuit Syst. Video Technol.})
@String(SPL = {IEEE Sign. Process. Letters})
@String(VR = {Vis. Res.})
@String(JOV = {J. Vis.})
@String(TVC = {The Vis. Comput.})
@String(JCST = {J. Comput. Sci. Tech.})
@String(CGF = {Comput. Graph. Forum})
@String(CVM = {Computational Visual Media})
@String(PAMI = {IEEE TPAMI})
@String(IJCV = {IJCV})
@String(CVPR = {CVPR})
@String(ICCV = {ICCV})
@String(ECCV = {ECCV})
@String(NIPS = {NeurIPS})
@String(ICPR = {ICPR})
@String(BMVC = {BMVC})
@String(TOG = {ACM TOG})
@String(TIP = {IEEE TIP})
@String(TVCG = {IEEE TVCG})
@String(TCSVT = {IEEE TCSVT})
@String(TMM = {IEEE TMM})
@String(ACMMM = {ACM MM})
@String(ICME = {ICME})
@String(ICASSP= {ICASSP})
@String(ICIP = {ICIP})
@String(ACCV = {ACCV})
@String(ICLR = {ICLR})
@String(IJCAI = {IJCAI})
@String(PR = {PR})
@String(AAAI = {AAAI})
@String(CVPRW= {CVPRW})
@String(CSVT = {IEEE TCSVT})
@misc{Authors14,
author = {FirstName LastName},
title = {The frobnicatable foo filter},
note = {Face and Gesture submission ID 324. Supplied as supplemental material {\tt fg324.pdf}},
year = 2014
}
@misc{Authors14b,
author = {FirstName LastName},
title = {Frobnication tutorial},
note = {Supplied as supplemental material {\tt tr.pdf}},
year = 2014
}
@article{Alpher02,
author = {FirstName Alpher},
title = {Frobnication},
journal = PAMI,
volume = 12,
number = 1,
pages = {234--778},
year = 2002
}
@article{Alpher03,
author = {FirstName Alpher and FirstName Fotheringham-Smythe},
title = {Frobnication revisited},
journal = {Journal of Foo},
volume = 13,
number = 1,
pages = {234--778},
year = 2003
}
@article{Alpher04,
author = {FirstName Alpher and FirstName Fotheringham-Smythe and FirstName Gamow},
title = {Can a machine frobnicate?},
journal = {Journal of Foo},
volume = 14,
number = 1,
pages = {234--778},
year = 2004
}
@inproceedings{Alpher05,
author = {FirstName Alpher and FirstName Gamow},
title = {Can a computer frobnicate?},
booktitle = CVPR,
pages = {234--778},
year = 2005
}

View File

@@ -1,94 +0,0 @@
\backcite {Sanderson2022}{{1}{1}{figure.caption.1}}
\backcite {Vezakis2024}{{1}{1}{figure.caption.1}}
\backcite {Wang2022b}{{1}{1}{figure.caption.1}}
\backcite {Carion2020}{{1}{1}{figure.caption.1}}
\backcite {Girshick2013}{{1}{1}{figure.caption.1}}
\backcite {He2017}{{1}{1}{figure.caption.1}}
\backcite {Dosovitskiy2021}{{1}{1}{figure.caption.1}}
\backcite {Liu2021}{{1}{1}{figure.caption.1}}
\backcite {Touvron2021b}{{1}{1}{figure.caption.1}}
\backcite {Khan2022}{{1}{1}{figure.caption.1}}
\backcite {Rangel2024}{{1}{1}{figure.caption.1}}
\backcite {Deng2009}{{1}{1}{figure.caption.1}}
\backcite {He2016}{{1}{1}{figure.caption.1}}
\backcite {Krizhevsky2012}{{1}{1}{figure.caption.1}}
\backcite {He2016}{{1}{1}{figure.caption.1}}
\backcite {Krizhevsky2012}{{1}{1}{figure.caption.1}}
\backcite {Touvron2022}{{1}{1}{figure.caption.1}}
\backcite {Wortsman2022}{{1}{1}{figure.caption.1}}
\backcite {Vaswani2017}{{1}{1}{figure.caption.1}}
\backcite {Dosovitskiy2021}{{1}{1}{figure.caption.1}}
\backcite {Carion2020}{{1}{1}{figure.caption.1}}
\backcite {Wang2022a}{{1}{1}{figure.caption.1}}
\backcite {Wortsman2022}{{1}{1}{figure.caption.1}}
\backcite {Yu2022}{{1}{1}{figure.caption.1}}
\backcite {Zong2022}{{1}{1}{figure.caption.1}}
\backcite {Shorten2019}{{1}{1}{figure.caption.1}}
\backcite {Xu2023d}{{1}{1}{figure.caption.1}}
\backcite {Ding2023a}{{1}{1}{figure.caption.1}}
\backcite {RojasGomez2023}{{1}{1}{figure.caption.1}}
\backcite {Ren2024}{{2}{1}{figure.caption.1}}
\backcite {Sun2024}{{2}{1}{figure.caption.1}}
\backcite {Suvorov2021}{{2}{1}{figure.caption.1}}
\backcite {Zhong2017}{{2}{2}{section*.3}}
\backcite {Liu2022d}{{2}{2}{section*.3}}
\backcite {Zhang2018a}{{2}{2}{section*.3}}
\backcite {Yun2019}{{2}{2}{section*.3}}
\backcite {Takahashi2018}{{2}{2}{section*.3}}
\backcite {Cubuk2018}{{2}{2}{section*.3}}
\backcite {Cubuk2019}{{2}{2}{section*.3}}
\backcite {Touvron2022}{{2}{2}{section*.3}}
\backcite {Shorten2019}{{2}{2}{section*.3}}
\backcite {Xu2023d}{{2}{2}{section*.3}}
\backcite {Ghiasi2020}{{2}{2}{section*.4}}
\backcite {Ghiasi2020}{{2}{2}{section*.4}}
\backcite {Shermaine2025}{{2}{2}{section*.4}}
\backcite {Ling2022}{{2}{2}{section*.4}}
\backcite {Werman2021}{{2}{2}{section*.4}}
\backcite {Hinterstoisser2019}{{2}{2}{section*.4}}
\backcite {Dwibedi2017}{{2}{2}{section*.4}}
\backcite {Ge2023}{{2}{2}{section*.4}}
\backcite {Kang2022}{{2}{2}{section*.4}}
\backcite {Hendrycks2019}{{2}{2}{section*.6}}
\backcite {Hendrycks2019}{{2}{2}{section*.6}}
\backcite {Li2023e}{{2}{2}{section*.6}}
\backcite {Zhang2024f}{{2}{2}{section*.6}}
\backcite {Geirhos2018}{{3}{2}{section*.6}}
\backcite {Xiao2020}{{3}{2}{section*.6}}
\backcite {Sun2024}{{3}{3.1}{subsection.3.1}}
\backcite {Ren2024}{{3}{3.1}{subsection.3.1}}
\backcite {Liu2023e}{{3}{3.1}{subsection.3.1}}
\backcite {Kirillov2023}{{3}{3.1}{subsection.3.1}}
\backcite {Suvorov2021}{{3}{3.1}{subsection.3.1}}
\backcite {Sun2024}{{3}{3.1}{subsection.3.1}}
\backcite {Touvron2022}{{4}{3.2}{subsection.3.2}}
\backcite {Suvorov2021}{{4}{1}{table.caption.7}}
\backcite {Suvorov2021}{{4}{1}{table.caption.7}}
\backcite {Sun2024}{{4}{1}{table.caption.7}}
\backcite {Le2015}{{4}{4.1}{subsection.4.1}}
\backcite {Sun2024}{{4}{4.1}{table.caption.8}}
\backcite {Suvorov2021}{{4}{4.1}{table.caption.8}}
\backcite {Suvorov2021}{{5}{2}{table.caption.8}}
\backcite {Bates1955}{{5}{4.1}{table.caption.9}}
\backcite {Nauen2025}{{5}{4.2}{table.caption.11}}
\backcite {Touvron2022}{{5}{4.2}{table.caption.11}}
\backcite {Dosovitskiy2021}{{5}{4.2}{table.caption.11}}
\backcite {Liu2021}{{5}{4.2}{table.caption.11}}
\backcite {He2016}{{5}{4.2}{table.caption.11}}
\backcite {Touvron2021b}{{5}{4.2}{table.caption.11}}
\backcite {Ge2023}{{6}{4.2}{table.caption.12}}
\backcite {Ghiasi2020}{{6}{4.2}{table.caption.12}}
\backcite {Shermaine2025}{{6}{4.2}{table.caption.12}}
\backcite {Maji2013}{{6}{4.2}{table.caption.13}}
\backcite {Dehghan2017}{{6}{4.2}{table.caption.13}}
\backcite {Nilsback2008}{{6}{4.2}{table.caption.13}}
\backcite {Kaur2017}{{6}{4.2}{table.caption.13}}
\backcite {Parkhi2012}{{6}{4.2}{table.caption.13}}
\backcite {Xiao2020}{{7}{4.3}{figure.caption.14}}
\backcite {Wang2024f}{{7}{4.3}{figure.caption.14}}
\backcite {Chattopadhay2018}{{7}{4.3}{figure.caption.15}}
\backcite {Selvaraju2016}{{7}{4.3}{figure.caption.15}}
\backcite {Sundararajan2017}{{7}{4.3}{figure.caption.15}}
\backcite {Selvaraju2016}{{7}{4.3}{figure.caption.15}}
\backcite {Chattopadhay2018}{{7}{4.3}{figure.caption.15}}
\backcite {Sundararajan2017}{{7}{4.3}{figure.caption.15}}

BIN
main.pdf

Binary file not shown.

502
main.tex
View File

@@ -1,70 +1,454 @@
% CVPR 2026 Paper Template; see https://github.com/cvpr-org/author-kit
%%%%%%%% ICML 2024 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%
\documentclass[10pt,twocolumn,letterpaper]{article}
\documentclass{article}
%%%%%%%%% PAPER TYPE - PLEASE UPDATE FOR FINAL VERSION
% \usepackage{cvpr} % To produce the CAMERA-READY version
\usepackage[review]{cvpr} % To produce the REVIEW version
% \usepackage[pagenumbers]{cvpr} % To force page numbers, e.g. for an arXiv version
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2024} with \usepackage[nohyperref]{icml2024} above.
\usepackage{hyperref}
% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}
% Use the following line for the initial blind version submitted for review:
% \usepackage{icml2024}
% If accepted, instead use the following line for the camera-ready submission:
\usepackage[accepted]{icml2024}
% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
% It is strongly recommended to use hyperref, especially for the review version.
% hyperref with option pagebackref eases the reviewers' job.
% Please disable hyperref *only* if you encounter grave issues,
% e.g. with the file validation for the camera-ready version.
%
% If you comment hyperref and then uncomment it, you should delete *.aux before re-running LaTeX.
% (Or just hit 'q' on the first LaTeX run, let it finish, and you should be clear).
\definecolor{cvprblue}{rgb}{0.21,0.49,0.74}
\usepackage[pagebackref,breaklinks,colorlinks,allcolors=cvprblue]{hyperref}
\input{packages}
%%%%%%%%% PAPER ID - PLEASE UPDATE
\def\paperID{4792} % *** Enter the Paper ID here
\def\confName{CVPR}
\def\confYear{2026}
%%%%%%%%% TITLE - PLEASE UPDATE
\newcommand{\schemename}{\textit{ForAug}\xspace}
\title{\schemename: Mitigating Biases and Improving Vision Transformer Training by Recombining Foregrounds and Backgrounds}
% \title{\schemename: Mitigating Biases and Improving ViT Training by Recombining Foregrounds and Backgrounds}
% \title{\LaTeX\ Author Guidelines for \confName~Proceedings}
%%%%%%%%% AUTHORS - PLEASE UPDATE
\author{
Tobias Christian Nauen\textsuperscript{\rm 1,\rm 2},
Brian Moser\textsuperscript{\rm 2},
Federico Raue\textsuperscript{\rm 2},
Stanislav Frolov\textsuperscript{\rm 2},
Andreas Dengel\textsuperscript{\rm 1,\rm 2} \\
\textsuperscript{\rm 1}RPTU University Kaiserslautern-Landau, Kaiserslautern, Germany \\
\textsuperscript{\rm 2}German Research Center for Artificial Intelligence (DFKI), Kaiserslautern, Germany \\
{\tt\small first\_second.last@dfki.de / first.last@dfki.de}
% For a paper whose authors are all at the same institution,
% omit the following lines up until the closing ``}''.
% Additional authors and addresses can be added with ``\and'',
% just like the second author.
% To save space, use either the email address or home page, not both
}
% The \icmltitle you define below is probably too long as a header.
% Therefore, a short form for the running title is supplied here:
\icmltitlerunning{Segment \& Recombine}
\begin{document}
\maketitle
\input{sec/abstract}
\input{sec/intro}
\input{sec/related_work}
\input{sec/method}
\input{sec/experiments}
% \input{sec/future_work}
\input{sec/conclusion}
\input{sec/acks}
{
\small
\bibliographystyle{ieeenat_fullname}
\bibliography{../JabRef/main_bib}
}
% WARNING: do not forget to delete the supplementary pages from your submission
% \appendix
% \onecolumn
% \input{sec/appendix}
\twocolumn[
\icmltitle{RecombiNet: A dataset for better ImageNet}
% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2024
% package.
% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country
% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
\icmlsetsymbol{equal}{*}
\begin{icmlauthorlist}
\icmlauthor{Tobias Christian Nauen}{rptu,dfki}
\icmlauthor{Brian Moser}{dfki}
\icmlauthor{Federico Raue}{dfki}
\icmlauthor{Stanislav Frolov}{dfki}
\icmlauthor{Andreas Dengel}{rptu,dfki}
\end{icmlauthorlist}
\icmlaffiliation{rptu}{Department of Computer Science, RPTU Kaiserslautern-Landau, Kaiserslautern, Germany}
\icmlaffiliation{dfki}{German Research Center for Artificial Intelligence (DFKI), Kaiserslautern, Germany}
\icmlcorrespondingauthor{Tobias Christian Nauen}{tobias\_christian.nauen@dfki.de}
% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
% the "keywords" metadata in the PDF but will not be shown in the document
\icmlkeywords{Machine Learning, ICML}
\vskip 0.3in
]
% this must go after the closing bracket ] following \twocolumn[ ...
% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.
\printAffiliationsAndNotice{} % leave blank if no need to mention equal contribution
% \printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.
\begin{abstract}
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet.
\end{abstract}
ImageNet \cite{Deng2009}
\section{Experiments}
\begin{itemize}
\item Train on all variants, then evaluate on all variants.
\item Train different models... ViT, ResNet, Swin, ? at multiple sizes.
\item Finetune for downstream tasks (classification).
\item Finetune for dense tasks / semantic segmentation.
\item Evaluate Diversity and Affinity from \cite{GontijoLopes2021}.
\item Intermediate bg-fg combinations between only \emph{same} and \emph{all}.
\item Train with different data augmentation setups on top. 3-Augment, AutoAugment, Real Guidance. With special focus on operations like Mixup \& CutMix.
\item Look into filtering foregrounds and backgrounds based on segmentation quality + size...
\end{itemize}
\section{Training on RecombinationNet}
\begin{table*}
\centering
\begin{tabular}{cccccccc}
\toprule
Model & Mode & \makecell{Foreground \\ Rotation} & \makecell{ImageNet \\ Accuracy [\%]} & \makecell{Recombine same \\ Accuracy [\%]} & \makecell{Recombine all \\ Accuracy [\%]} & \makecell{Backgrounds \\ Accuracy [\%]} \\
\midrule
ViT-S/16 & ImageNet & & 79.73 & 68.2 & 50.6 & 15.7 \\
\midrule
ViT-S/16 & same & $\pm 0$ & 82.3 & 82.0 & 67.9 & 27.2 \\
ViT-S/16 & same & $\pm 10$ & 82.2 & 81.9 & 67.9 & 27.3 \\
ViT-S/16 & same & $\pm 20$ & 82.2 & 82.0 & 67.9 & 27.0 \\
ViT-S/16 & same & $\pm 30$ & 82.2 & 82.1 & 67.8 & 27.0 \\
ViT-S/16 & all & $\pm 0$ & 76.8 & 76.4 & 76.5 & 03.1 \\
ViT-S/16 & all & $\pm 20$ & \\
ViT-S/16 & same & & \\
ViT-S/16 & same & & \\
ViT-S/16 & same & & \\
ViT-S/16 & same & & \\
\bottomrule
\end{tabular}
\caption{Training on RecombinationNet v24-10 (300 ep), evaluating on ImageNet.}
\end{table*}
\begin{table*}
\centering
\begin{tabular}{ccccccc}
\toprule
Model & Train DS & \makecell{ImageNet \\ Accuracy [\%]} & \makecell{Recombine same \\ Accuracy [\%]} & \makecell{Recombine all \\ Accuracy [\%]} & \makecell{Backgrounds \\ Accuracy [\%]} \\
\midrule
ViT-Ti/16 & IN1k & 76.1 & 64.5/67.3 & 47.3 & 12.8 \\
ViT-Ti/16 & RN same & 64.7 & 75.4 & 59.4 & 19.5 \\
ViT-Ti/16 & RN all & 53.5 & 70.6 & 70.6 & 03.2 \\
\midrule
ViT-S/16 & IN1k & 79.6 \\
ViT-S/16 & RN same & 69.7 & 82.0 & 67.9 & 27.0 \\
ViT-S/16 & RN all & 50.1 & 76.6 & 76.4 & 02.9 \\
\midrule
ViT-B/16 & IN1k & 78.0 & 65.9/68.8 & 48.2 & 16.1 \\
ViT-B/16 & RN same & 70.1 & 81.5 & 70.7 & 30.9 \\
ViT-B/16 & RN all & 44.9 & 75.9 & 76.0 & 02.5 \\
\midrule
Swin-Ti & IN1k & 77.9 & 66.5/69.2 & 48.9 & 15.6 \\
Swin-Ti & RN same & 63.6 & 81.4 & 70.1 & 29.1 \\
Swin-Ti & RN all & 09.4 & 76.9 & 75.7 & 00.6 \\
\midrule
Swin-S & IN1k & 79.7 & 67.9/70.9 & 50.5 & 16.8 \\
Swin-S & RN same & 65.0 & 82.5 & 72.4 & 34.1 \\
Swin-S & RN all & FAIL \\
\midrule
ResNet-34 & IN1k & 75.3 & 65.9/67.8 & 51.7 & 11.8 \\
ResNet-34 & RN same & 63.5 & 75.7 & 63.9 & 17.9 \\
ResNet-34 & RN all & 37.2 & 72.0 & 71.4 & 02.0 \\
\midrule
ResNet-50 & IN1k & 78.4 & 69.2/71.2 & 55.9 & 13.9 \\
ResNet-50 & RN same & 66.8 & 80.3 & 70.4 & 23.5 \\
ResNet-50 & RN all & 21.4 & 75.6 & 75.6 & 01.1 \\
\midrule
ResNet-101 & IN1k & 79.5 & 70.2/72.1 & 57.1 & 14.8 \\
ResNet-101 & RN same & 68.9 & 81.8 & 72.7 & 27.3 \\
ResNet-101 & RN all & 24.0 & 76.9 & 76.8 & 01.5 \\
\bottomrule
\end{tabular}
\caption{Training on RecombinationNet v24-10? (300 ep), evaluating on ImageNet.}
\end{table*}
\begin{table*}
\centering
\small
\begin{tabular}{llcccccc}
\toprule
& & \multicolumn{3}{c}{ViT-Ti} & \multicolumn{3}{c}{ResNet34} \\
\cmidrule(r){3-5} \cmidrule(l){6-8}
Version & Version ID & \makecell{TIN \\ Acc. [\%]} & \makecell{TRN same \\ max \\ Acc. [\%]} & \makecell{TRN same \\ max pt=.6 \\ Acc. [\%]} & \makecell{TIN \\ Acc. [\%]} & \makecell{TRN same \\ max \\ Acc. [\%]} & \makecell{TRN same \\ max pt=.6 \\ Acc. [\%]} \\
\midrule
24-01-10 & TINS & 66.7 & 60.1 & 60.7 & 77.9 & 71.4 & 73.2 \\
25-01-13 & TINS\_v2 & 66.7 & 61.0 & 62.4 & 77.9 & 72.2 & 74.2 \\
25-01-17 man & TINS\_v3\_f1 & 66.7 & 61.8 & 61.9 & 77.9 & 73.5 & 74.2 \\
25-01-17 auto & TINS\_v3\_f2 & 66.7 & 61.4 & 61.7 & 77.9 & 73.2 & 74.1 \\
25-01-24 first & TINS\_v4\_f1 & 66.7 & 62.7 & 62.7 & 77.9 & 73.6 & 74.9 \\
25-01-24 all man & TINS\_v5\_f1 & 66.7 & 61.9 & 62.3 & 77.9 & 73.7 & 74.5 \\
25-01-24 all auto & TINS\_v5\_f2 & 66.7 & 61.5 & 62.2 & 77.9 & 73.8 & 74.4 \\
25-02-03 all man & TINS\_v6\_f1 & 66.7 & 61.3 & 62.1 & 77.9 & 73.1 & 75.0 \\
25-02-04 all man & TINS\_v7\_f1 & 66.7 & & & 77.9 \\
\bottomrule
\end{tabular}
\caption{Training on TinyImageNet and evaluating on different TinyRecombNet versions.}
\end{table*}
\begin{table*}
\centering
\small
\begin{tabular}{clcccccccccc}
\toprule
& & ViT-Ti/16 & ViT-S/16 & ResNet34 & ResNet50 & ResNet101 \\
\cmidrule(r){3-3} \cmidrule(lr){4-4} \cmidrule(l){5-5} \cmidrule(l){6-6} \cmidrule(l){7-7}
Version & Train DS & \makecell{TIN \\ Acc. [\%]} & \makecell{TIN \\ Acc. [\%]} & \makecell{TIN \\ Acc. [\%]} & \makecell{TRN same \\ max \\ Acc. [\%]} & \makecell{TRN same \\ max pt=.6 \\ Acc. [\%]} \\
\midrule
% \multirow{4}{*}{\rot{\makecell{v25-01-10 \\ TINS}}} & TIN & 66.7 & 68.9 & 77.9 & \\
% & TRN same/range & \textbf{67.1} & \textbf{72.6} & 75.9 \\
% & TRN same/range pt=1.0 & 66.8 & \underline{\textbf{73.5}} & \underline{76.2} \\
% & TRN same/range pt=0.8 & 65.8 & & 75.7 \\
% \midrule
% \multirow{4}{*}{\rot{\makecell{v25-01-13 \\ TINS\_v2}}} & TIN & 66.7 & 68.9 & 77.9 & \\
% & TRN same/range & 65.0 & & \\
% & TRN same/range pt=1.0 & 65.9 & & 73.1 \\
% & TRN same/range pt=0.8 & 65.3 & & 73.9 \\
% \midrule
% \multirow{4}{*}{\rot{\makecell{v25-01-17 \\ man \\ TINS\_v3\_f1}}} & TIN & 66.7 & 68.9 & 77.9 & \\
% & TRN same/range & 65.9 & & 75.3 \\
% & TRN same/range pt=1.0 & 65.0 & & 75.6 \\
% & TRN same/range pt=0.8 & 64.2 & & 75.0 \\
% \midrule
% \multirow{4}{*}{\rot{\makecell{v25-01-17 \\ auto \\ TINS\_v3\_f2}}} & TIN & 66.7 & 68.9 & 77.9 & \\
% & TRN same/range & \textbf{67.0} & \textbf{73.0} & 76.1 \\
% & TRN same/range pt=1.0 & 65.4 & \textbf{72.1} & 75.9 \\
% & TRN same/range pt=0.8 & 65.4 & & 75.3 \\
% \midrule
% \multirow{4}{*}{\rot{\makecell{v25-01-24 \\ first \\ TINS\_v4\_f1}}} & TIN & 66.7 & 68.9 & 77.9 & \\
% & TRN same/range & 65.3 & \textbf{70.9} & 75.1 \\
% & TRN same/range pt=1.0 & 65.3 & \textbf{72.0} & 75.3 \\
% & TRN same/range pt=0.8 & 66.0 & \textbf{70.5} & 75.5 \\
% \midrule
\multirow{13}{*}{\rot{\makecell{v25-01-24 \\ all-man \\ TINS\_v5\_f1}}} & TIN & 66.7/65.7/65.9 & 68.9 & 77.9/77.9/78.1 & 79.1 & 79.8 \\
& TRN same/range & \textbf{66.7}/67.2/66.9 & \textbf{72.3} & 75.4 \\
& TRN same/range pt=1.0 & \textbf{67.5}/66.6 & \textbf{71.9} & 76.1 \\
& TRN same/range pt=0.8 & 65.8(?)/66.2 & \textbf{70.8} & 75.9 \\
\cmidrule{2-2}
& TRN same/range p$\to$t & \textbf{66.7}/66.6 & \textbf{72.1}/71.6/71.8 & 75.6 \\
& TRN same/range pt=1.0 p$\to$t & \textbf{66.9}/66.5/66.5 & \textbf{72.7}/73.1 & 75.8 \\
& TRN same/range pt=0.9 p$\to$t & \textbf{66.9} & \textbf{73.2} \\
& TRN same/range pt=0.8 p$\to$t & \makecell{\textbf{67.8}/64.7/67.1 \\ 66.6/66.3} & \makecell{\textbf{73.5}/70.1/72.2 \\ 71.7} & 75.9 \\
& TRN same/range pt=0.7 p$\to$t & \textbf{66.9} & \textbf{71.1} & 76.0 \\
& TRN same/range pt=0.6 p$\to$t & 66.4 & \textbf{72.0} & 76.3 \\
& TRN same/range pt=0.5 p$\to$t & 66.6 & \textbf{69.3} & 75.3 \\
\cmidrule{2-2}
& TRN all/range & 56.6 & 62.2 & 63.3 \\
\midrule
% \multirow{4}{*}{\rot{\makecell{v25-01-24 \\ all-auto \\ TINS\_v5\_f2}}} & TIN & 66.7 & 68.9 & 77.9 & \\
% & TRN same/range & 66.0(?) & \textbf{71.7} & 75.6 \\
% & TRN same/range pt=1.0 & 66.5 & \textbf{71.1} & 75.7 \\
% & TRN same/range pt=0.8 & 66.3 & \textbf{71.2} & 75.8 \\
% \midrule
\multirow{12}{*}{\rot{\makecell{v25-02-03 \\ all-man \\ TINS\_v6\_f1}}} & TIN & 66.7 & 68.9 & 77.9 \\
& TRN same/range & \textbf{67.3} & \textbf{72.9} & 76.0 \\
& TRN same/range pt=1.0 & \textbf{67.2} & \textbf{73.6} & 76.2 \\
& TRN same/range pt=0.8 & 66.6 & \textbf{73.1} & 75.5 \\
\cmidrule{2-2}
& TRN same/range p$\to$t & & \textbf{73.6/74.2} & 75.8 \\
& TRN same/range pt=1.0 p$\to$t & \textbf{67.4} & \underline{\textbf{74.2}} & 76.0 \\
& TRN same/range pt=0.9 p$\to$t & \textbf{67.4} & \textbf{74.1} & 76.0 \\
& TRN same/range pt=0.8 p$\to$t & \underline{\textbf{68.4}} & \textbf{72.4} & 76.0 \\
& TRN same/range pt=0.7 p$\to$t & & \textbf{72.2} & 75.7 \\
\cmidrule{2-2}
& TRN same/range p$\to$t ed blr 2 & 66.3 & \textbf{72.3} & 72.5 \\
& TRN same/range p$\to$t ed blr 4 & 65.3 & \textbf{69.8} & 74.5 \\
\midrule
% \multirow{10}{*}{\rot{\makecell{v25-02-04 \\ all-man \\ TINS\_v7\_f1}}} & TIN & 66.7 & 68.9 & 77.9 \\
% & TRN same/range & \textbf{68.2/66.8/67.0} & \textbf{/72.2/73.5} & 75.8 \\
% & TRN same/range pt=1.0 & \textbf{67.4} & \textbf{73.3} & 75.5 \\
% & TRN same/range pt=0.8 & \textbf{67.0} & \textbf{73.3} & 75.4 \\
% \cmidrule{2-2}
% & TRN same/range p$\to$t & \textbf{67.1/67.8/ } & \textbf{73.7/ /73.8} & \underline{76.2} \\
% & TRN same/range pt=1.0 p$\to$t & \textbf{68.0} & \textbf{73.5} & 75.5 \\
% & TRN same/range pt=0.9 p$\to$t & \textbf{67.8} & \textbf{73.2} & 75.8 \\
% & TRN same/range pt=0.8 p$\to$t & 66.4/\textbf{67.4/67.7} & \textbf{72.8/73.6} & 75.7 \\
% & TRN same/range pt=0.7 p$\to$t & \textbf{67.7} & \textbf{71.4} & 76.0 \\
% \midrule
\multirow{28}{*}{\rot{\makecell{v25-02-04 \\ all-man RETEST \\ TINS\_v7\_f1}}} & TIN & 66.7 & 68.9 & 77.9 \\
& TRN same/range & 68.7/66.4/67.5 & 72.1/73.0/72.2 \\
& TRN same/range & $67.5 \pm 1.2$ & $72.4 \pm 0.5$ \\
& TRN same/range pt=1.0 & 67.1/66.5/66.6 & 72.8/72.4/72.5 \\
& TRN same/range pt=1.0 & $66.7 \pm 0.2$ & $72.6 \pm 0.1$ \\
& TRN same/range pt=0.8 & 66.1/66.1/66.3 & 72.6/71.4/71.5 \\
& TRN same/range pt=0.8 & $66.2 \pm 0.1$ & $71.8 \pm 0.9$ \\
\cmidrule{2-2}
& TRN same/range p$\to$t & 68.5/66.1/66.8 & 73.0/72.3/73.3 \\
& TRN same/range p$\to$t & $67.1 \pm 1.2$ & $72.9 \pm 0.5$ \\
& TRN same/range pt=1.0 p$\to$t & 67.6/66.1/67.2 & 72.5/73.1/73.3 \\
& TRN same/range pt=1.0 p$\to$t & $67.0 \pm 1.2$ & $73.0 \pm 0.3$ \\
& TRN same/range pt=0.8 p$\to$t & 67.5/67.5/66.5 & 72.2/73.3/73.3 \\
& TRN same/range pt=0.8 p$\to$t & $67.2 \pm 0.7$ & $72.9 \pm 0.8$ \\
\cmidrule{2-2}
& TRN s/r p$\to$t pt=0.8 edgb=2 & 67.6 \\
& TRN s/r p$\to$t pt=0.8 edgb=4 & 65.8 \\
\cmidrule{2-2}
& TRN s/r p$\to$t pt=0.8 orig p=0.1 & 69.4 \\
& TRN s/r p$\to$t pt=0.8 orig p=0.2 & 70.4 \\
& TRN s/r p$\to$t pt=0.8 orig p=0.33 & 70.0 & 74.1 \\
& TRN s/r p$\to$t pt=0.8 orig p=0.5 & 70.0 \\
& TRN s/r p$\to$t pt=0.8 orig p=lin & 69.5 \\
& TRN s/r p$\to$t pt=0.8 orig p=invlin & 67.5 \\
& TRN s/r p$\to$t pt=0.8 orig p=cos & 71.3 \\
\cmidrule{2-2}
& TRN s/r p$\to$t pt=0.8 orig p=0.5 edgb=2 & 69.5 \\
& TRN s/r p$\to$t pt=0.8 orig p=0.5 edgb=4 & 70.6 \\
& TRN s/r p$\to$t pt=0.8 orig p=cos edgb=2 \\
& TRN s/r p$\to$t pt=0.8 orig p=cos edgb=4 \\
\bottomrule
\end{tabular}
\caption{Training on TinyRecombinationNet (300 ep), evaluating on ImageNet. TIN=TinyImageNet, TRN=TinyRecombNet, TBN=TinyBackgroundNet. Versions are for train and test sets. v25-02-03 is comparable to v25-01-17, v25-02-04 is comparable to v25-01-24.}
\end{table*}
\begin{table*}
\centering
\small
\begin{tabular}{llcccccccc}
\toprule
\multirow{2}{*}{Model} & Test Dataset $\rightarrow$ & \multicolumn{4}{c}{TinyRecombNet-5-1/same/range} & \multicolumn{4}{c}{TinyRecombNet-5-1/all/range} \\
\cmidrule(rl){3-6} \cmidrule(l){7-10}
& Train Dataset $\downarrow$ & IG & GradCAM & GradCAM++ & Attn & IG & GradCAM & GradCAM++ & Attn \\
\midrule
\multirow{3}{*}{ViT-Ti/16} & TIN & 1.07 & 1.56 & 1.83 & 1.72 & 1.12 & 2.20 & 2.21 & 1.78 \\
& TRN-5-1/same & 1.16 & 2.00 & 2.17 & 1.61 & 1.31 & 2.76 & 2.37 & 1.62 \\
& TRN-5-1/all & 1.51 & 2.90 & 2.69 & 2.37 & 1.56 & 3.00 & 2.74 & 2.33 \\
\midrule
\multirow{3}{*}{ViT-S/16} & TIN & 1.10 & 1.59 & 1.77 & 1.47 & 1.15 & 2.25 & 2.26 & 1.52 \\
& TRN-5-1/same & 1.18 & 1.67 & 1.69 & 1.38 & 1.40 & 2.68 & 2.46 & 1.49 \\
& TRN-5-1/all & 1.40 & 2.56 & 2.70 & 1.81 & 1.40 & 2.65 & 2.76 & 1.68 \\
\midrule
\multirow{3}{*}{ResNet34} & TIN & 1.40 & 1.89 & 1.71 & & 1.49 & 2.25 & 1.83 \\
& TRN-5-1/same & 1.41 & 1.97 & 1.86 & & 1.70 & 2.52 & 2.07 \\
& TRN-5-1/all & 2.19 & 2.67 & 2.64 & & 1.98 & 2.64 & 2.60 \\
\bottomrule
\end{tabular}
\caption{Relative foreground importance ratio for different models and training datasets. We calculate the per-pixel importance for the foreground label class. Then we aggregate the results for the foreground and background regions. The relative foreground importance is the fraction of total importance attributed to the foreground region, divided by the fraction of the image area occupied by the foreground: $\text{Rel fg importance} := \frac{\text{importance in fg region}}{\text{total importance}} / \frac{\text{fg size}}{\text{total image size}}$. We average the results on the whole validation set. Training and evaluation were done without background pruning with range fg insertion mode. Using TinyRecombNet v25-01-24.
\\
\tldr Our dataset/data augmentation improves the focus on the foreground class. Maybe it does not really work with ResNet, because that already has a good focus on the foreground object.}
\end{table*}
\begin{table*}
\centering
\begin{tabular}{llccccccccc}
\toprule
\multirow{2}{*}{Model} & Test Dataset $\rightarrow$ & TIN & \multicolumn{4}{c}{RecombNet-5-1/same/range} & \multicolumn{4}{c}{RecombNet-5-1/all/range} \\
\cmidrule(rl){4-7} \cmidrule(l){8-11}
& Train Dataset $\downarrow$ & & $\eta = 1$ & $\eta = 2$ & $\eta = 3$ & $\eta = 4$ & $\eta = 1$ & $\eta = 2$ & $\eta = 3$ & $\eta = 4$ \\
\midrule
\multirow{6}{*}{ViT-Ti/16} & TIN & 66.7 & 58.7 & 59.6 & 60.1 & 60.2 & 34.3 & 34.4 & 35.3 & 35.4 \\
& TRN-5-1/same $\eta = -3$ & 66.0 \\
& TRN-5-1/same $\eta = -2$ & 66.8 & \\
& TRN-5-1/same $\eta = 1$ & 66.7 & 74.0 & 74.7 & 74.8 & 75.3 & 49.4 & 49.1 & 49.8 & 49.9 \\
& TRN-5-1/same $\eta = 2$ & 66.0 & 74.0 & 74.9 & 74.8 & 74.6 & 49.6 & 50.3 & 50.6 & 51.4 \\
& TRN-5-1/same $\eta = 3$ & 64.5 & 71.8 & 74.0 & 74.3 & 74.3 & 48.0 & 48.9 & 48.7 & 50.1 \\
\midrule
\multirow{6}{*}{ViT-S/16} & TIN & 68.9 & 60.8 & 62.6 & 63.1 & 62.5 & 36.6 & 37.9 & 38.7 & 39.1 \\
& TRN-5-1/same $\eta = -3$ & 71.3 \\
& TRN-5-1/same $\eta = -2$ & 71.5 & \\
& TRN-5-1/same $\eta = 1$ & 72.3 & 80.2 & 79.7 & 80.4 & 80.1 & 56.4 & 57.5 & 57.2 & 57.2 \\
& TRN-5-1/same $\eta = 2$ & 71.3 & 79.1 & 79.6 & 79.9 & 80.1 & 58.2 & 57.3 & 58.2 & 58.2 \\
& TRN-5-1/same $\eta = 3$ & 71.4 & 78.6 & 79.6 & 79.7 & 80.1 & 55.6 & 57.5 & 57.9 & 57.9 \\
\midrule
\multirow{4}{*}{ResNet34} & TIN & 77.9 & 72.2 & 72.7 & 72.7 & 73.2 & 54.2 & 54.6 & 54.2 & 54.7 \\
& TRN-5-1/same $\eta = 1$ & 75.4 & 83.4 & 83.7 & 83.2 & 83.4 & 69.2 & 69.6 & 68.6 & 68.81 \\
& TRN-5-1/same $\eta = 2$ & 76.0 & 83.1 & 83.3 & 83.4 & 83.7 & 68.5 & 69.1 & 69.2 & 69.3 \\
& TRN-5-1/same $\eta = 3$ & 75.8 & 83.1 & 83.1 & 84.0 & 83.5 & 67.3 & 67.8 & 69.2 & 69.0 \\
\bottomrule
\end{tabular}
\caption{Importance of foreground centering via the Bates distribution. $\eta$ is the parameter of the Bates distribution. $\eta = 1$ is the uniform distribution. Training and evaluation were done without background pruning with range fg insertion mode. \\
\tldr Focussing the foreground object in the center makes the task easier (increasing performance left to right), but this then does not aid training (decreasing performance top to bottom) $\Rightarrow \eta = 1$ is the optimum (for training).}
\end{table*}
\begin{table*}
\centering
\begin{tabular}{lccc}
\toprule
\makecell[l]{Augmentation \\ Policy} & Dataset & \multicolumn{2}{c}{\makecell{TinyImageNet \\ Accuracy [\%]}} \\
& & ViT-Ti & ResNet34 \\
\cmidrule(r){1-2} \cmidrule(l){3-4}
\multirow{5}{*}{\makecell{minimal \\ w/o cutmix}} & TinyImageNet & 37.7 & 69.1 \\
& TinyRecombNet-3-2 & 50.7 & 69.1 \\
& TinyRecombNet-3-1 & 51.0 & 69.2 \\
& TinyDoublecombNet-3-2 & 44.9 & 68.6 + \\
& TinyDoublecombNet-3-1 & 46.7 & 68.6 \\
\cmidrule(r){1-2} \cmidrule(l){3-4}
\multirow{3}{*}{\makecell{minimal \\ w/ cutmix}} & TinyImageNet & 57.7 & \textbf{73.0} \\
& TinyRecombNet-3-2 & \textbf{60.6} & 70.3 \\
& TinyRecombNet-3-1 & 59.8 & 70.2 \\
\cmidrule(r){1-2} \cmidrule(l){3-4}
\multirow{3}{*}{\makecell{3-augment \\ w/o cutmix}} & TinyImageNet & 45.3 & 71.6 \\
& TinyRecombNet-3-2 & \textbf{59.9} & \textbf{74.4} \\
& TinyRecombNet-3-1 & 59.8 & 74.3 \\
\cmidrule(r){1-2} \cmidrule(l){3-4}
\multirow{3}{*}{\makecell{3-augment \\ w/ cutmix}} & TinyImageNet & \textbf{65.9} & \textbf{78.0} \\
& TinyRecombNet-3-2 & 65.4 & 76.0 \\
& TinyRecombNet-3-1 & \textbf{65.9} & 75.3 \\
\bottomrule
\end{tabular}
\caption{Training on TinyRecombinationNet (300 ep), evaluating on ImageNet with different data augmentation setups. TinyRecombNet version is v25-01-17-man/auto.}
\end{table*}
\section{Evaluating on RecombinationNet}
\begin{table*}
\centering
\begin{tabular}{llcccccccc}
\toprule
Model & Eval DS & IN1k Baseline & No pruning & $pt = 1.0$ & $pt = 0.8$ & $pt = 0.6$ & $pt = 0.4$ & $pt = 0.3$ & $pt = 0.2$ \\
\midrule
ViT-B/16 & RN same & 78.0 & 68.8 & 68.9 & 69.4 & 70.2 & 70.7 & 71.2 & 71.2 \\
ViT-B/16 & Backgrounds & & 16.1 & 16.1 & 17.9 & 13.1 & 15.1 \\
\midrule
ResNet-101 & RN same & 79.5 & 72.2 & 72.3 & 73.0 & 73.1 & 73.8 & 73.8 & 74.4 \\
ResNet-101 & Backgrounds & & 14.8 & 14.8 & 16.4 & 11.6 & 13.4 \\
\bottomrule
\end{tabular}
\caption{Training on ImageNet and evaluating with different background prune thresholds. \textbf{RecombNet v25-01-10}.}
\end{table*}
\bibliography{../JabRef/main_bib.bib}
\bibliographystyle{icml2024}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\appendix
\onecolumn
\end{document}

View File

@@ -1,130 +0,0 @@
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage[submission]{aaai2026} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
\usepackage{helvet} % DO NOT CHANGE THIS
\usepackage{courier} % DO NOT CHANGE THIS
\usepackage[hyphens]{url} % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm} % DO NOT CHANGE THIS
\usepackage{natbib} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage{algorithmic}
%
% These are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper.
\usepackage{newfloat}
\usepackage{listings}
\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
\lstset{%
basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
aboveskip=0pt,belowskip=0pt,%
showstringspaces=false,tabsize=2,breaklines=true}
\floatstyle{ruled}
\newfloat{listing}{tb}{lst}{}
\floatname{listing}{Listing}
\input{packages}
%
% Keep the \pdfinfo as shown here. There's no need
% for you to add the /Title and /Author tags.
\pdfinfo{
/TemplateVersion (2026.1)
}
% DISALLOWED PACKAGES
% \usepackage{authblk} -- This package is specifically forbidden
% \usepackage{balance} -- This package is specifically forbidden
% \usepackage{color (if used in text)
% \usepackage{CJK} -- This package is specifically forbidden
% \usepackage{float} -- This package is specifically forbidden
% \usepackage{flushend} -- This package is specifically forbidden
% \usepackage{fontenc} -- This package is specifically forbidden
% \usepackage{fullpage} -- This package is specifically forbidden
% \usepackage{geometry} -- This package is specifically forbidden
% \usepackage{grffile} -- This package is specifically forbidden
% \usepackage{hyperref} -- This package is specifically forbidden
% \usepackage{navigator} -- This package is specifically forbidden
% (or any other package that embeds links such as navigator or hyperref)
% \indentfirst} -- This package is specifically forbidden
% \layout} -- This package is specifically forbidden
% \multicol} -- This package is specifically forbidden
% \nameref} -- This package is specifically forbidden
% \usepackage{savetrees} -- This package is specifically forbidden
% \usepackage{setspace} -- This package is specifically forbidden
% \usepackage{stfloats} -- This package is specifically forbidden
% \usepackage{tabu} -- This package is specifically forbidden
% \usepackage{titlesec} -- This package is specifically forbidden
% \usepackage{tocbibind} -- This package is specifically forbidden
% \usepackage{ulem} -- This package is specifically forbidden
% \usepackage{wrapfig} -- This package is specifically forbidden
% DISALLOWED COMMANDS
% \nocopyright -- Your paper will not be published if you use this command
% \addtolength -- This command may not be used
% \balance -- This command may not be used
% \baselinestretch -- Your paper will not be published if you use this command
% \clearpage -- No page breaks of any kind may be used for the final version of your paper
% \columnsep -- This command may not be used
% \newpage -- No page breaks of any kind may be used for the final version of your paper
% \pagebreak -- No page breaks of any kind may be used for the final version of your paper
% \pagestyle -- This command may not be used
% \tiny -- This is not an acceptable font size.
% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference
\setcounter{secnumdepth}{2} %May be changed to 1 or 2 if section numbers are desired.
% The file aaai2026.sty is the style file for AAAI Press
% proceedings, working notes, and technical reports.
%
% Title
\newcommand{\name}{\textit{ForNet}\xspace}
\newcommand{\schemename}{\textit{ForAug}\xspace}
% Names: RecombiNet, RecombNet, ReMix, ReMixNet, FoReMix/ForeMix
%%%%%%%%% TITLE - PLEASE UPDATE
\title{\schemename: Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation}
%%%%%%%%% AUTHORS - PLEASE UPDATE
\author {
Tobias Christian Nauen\textsuperscript{\rm 1, \rm 2},
Brian Moser\textsuperscript{\rm 2},
Federico Raue\textsuperscript{\rm 2},
Stanislav Frolov\textsuperscript{\rm 2},
Andreas Dengel\textsuperscript{\rm 1, \rm 2}
}
\affiliations {
\textsuperscript{\rm 1}RPTU Kaiserslautern-Landau, Kaiserslautern, Germany \\
\textsuperscript{\rm 2}German Research Center for Artificial Intelligence (DFKI), Kaiserslautern, Germany \\
{\tt\small first\_second.last@dfki.de / first.last@dfki.de}
}
\begin{document}
\maketitle
\input{sec/abstract}
\input{sec/intro}
\input{sec/related_work}
\input{sec/method}
\input{sec/experiments}
% \input{sec/future_work}
\input{sec/conclusion}
\input{sec/acks}
\bibliography{../JabRef/main_bib}
% \newpage
% \onecolumn
% \appendix
% \input{sec/appendix}
\newpage
\input{sec/reproducability.tex}
\end{document}

View File

@@ -1,51 +0,0 @@
%% This file contains a number of tweaks that are typically applied to the main document.
%% They are not enabled by default, but can be enabled by uncommenting the relevant lines.
%%
%% Inline annotations; for predefined colors, refer to "dvipsnames" in the xcolor package:
%% https://tinyurl.com/overleaf-colors
%%
\newcommand{\red}[1]{{\color{red}#1}}
\newcommand{\todo}[1]{{\color{red}#1}}
\newcommand{\TODO}[1]{\textbf{\color{red}[TODO: #1]}}
%%
%% disable for camera ready / submission by uncommenting these lines
%%
% \renewcommand{\TODO}[1]{}
% \renewcommand{\todo}[1]{#1}
%%
%% work harder in optimizing text layout. Typically shrinks text by 1/6 of page, enable
%% it at the very end of the writing process, when you are just above the page limit
%%
% \usepackage{microtype}
%%
%% fine-tune paragraph spacing
%%
% \renewcommand{\paragraph}[1]{\vspace{.5em}\noindent\textbf{#1.}}
%%
%% globally adjusts space between figure and caption
%%
% \setlength{\abovecaptionskip}{.5em}
%%
%% Allows "the use of \paper to refer to the project name"
%% with automatic management of space at the end of the word
%%
% \usepackage{xspace}
% \newcommand{\paper}{ProjectName\xspace}
%%
%% Commonly used math definitions
%%
% \DeclareMathOperator*{\argmin}{arg\,min}
% \DeclareMathOperator*{\argmax}{arg\,max}
%%
%% Tigthen underline
%%
% \usepackage{soul}
% \setuldepth{foobar}

View File

@@ -1,137 +0,0 @@
\documentclass[10pt,twocolumn,letterpaper]{article}
\usepackage[rebuttal]{cvpr}
% Include other packages here, before hyperref.
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
% Import additional packages in the preamble file, before hyperref
\input{preamble}
% If you comment hyperref and then uncomment it, you should delete
% egpaper.aux before re-running latex. (Or just hit 'q' on the first latex
% run, let it finish, and you should be clear).
\definecolor{cvprblue}{rgb}{0.21,0.49,0.74}
\usepackage[pagebackref,breaklinks,colorlinks,allcolors=cvprblue]{hyperref}
% If you wish to avoid re-using figure, table, and equation numbers from
% the main paper, please uncomment the following and change the numbers
% appropriately.
%\setcounter{figure}{2}
%\setcounter{table}{1}
%\setcounter{equation}{2}
% If you wish to avoid re-using reference numbers from the main paper,
% please uncomment the following and change the counter value to the
% number of references you have in the main paper (here, 100).
%\makeatletter
%\apptocmd{\thebibliography}{\global\c@NAT@ctr 100\relax}{}{}
%\makeatother
%%%%%%%%% PAPER ID - PLEASE UPDATE
\def\paperID{*****} % *** Enter the Paper ID here
\def\confName{CVPR}
\def\confYear{2026}
\begin{document}
%%%%%%%%% TITLE - PLEASE UPDATE
\title{\LaTeX\ Guidelines for Author Response} % **** Enter the paper title here
\maketitle
\thispagestyle{empty}
\appendix
%%%%%%%%% BODY TEXT - ENTER YOUR RESPONSE BELOW
\section{Introduction}
After receiving paper reviews, authors may optionally submit a rebuttal to address the reviewers' comments, which will be limited to a {\bf one page} PDF file.
Please follow the steps and style guidelines outlined below for submitting your author response.
The author rebuttal is optional and, following similar guidelines to previous conferences, is meant to provide you with an opportunity to rebut factual errors or to supply additional information requested by the reviewers.
It is NOT intended to add new contributions (theorems, algorithms, experiments) that were absent in the original submission and NOT specifically requested by the reviewers.
You may optionally add a figure, graph, or proof to your rebuttal to better illustrate your answer to the reviewers' comments.
Per a passed 2018 PAMI-TC motion, reviewers should refrain from requesting significant additional experiments for the rebuttal or penalize for lack of additional experiments.
Authors should refrain from including new experimental results in the rebuttal, especially when not specifically requested to do so by the reviewers.
Authors may include figures with illustrations or comparison tables of results reported in the submission/supplemental material or in other papers.
Just like the original submission, the rebuttal must maintain anonymity and cannot include external links that reveal the author identity or circumvent the length restriction.
The rebuttal must comply with this template (the use of sections is not required, though it is recommended to structure the rebuttal for ease of reading).
%-------------------------------------------------------------------------
\subsection{Response length}
Author responses must be no longer than 1 page in length including any references and figures.
Overlength responses will simply not be reviewed.
This includes responses where the margins and formatting are deemed to have been significantly altered from those laid down by this style guide.
Note that this \LaTeX\ guide already sets figure captions and references in a smaller font.
%------------------------------------------------------------------------
\section{Formatting your Response}
{\bf Make sure to update the paper title and paper ID in the appropriate place in the tex file.}
All text must be in a two-column format.
The total allowable size of the text area is $6\frac78$ inches (17.46 cm) wide by $8\frac78$ inches (22.54 cm) high.
Columns are to be $3\frac14$ inches (8.25 cm) wide, with a $\frac{5}{16}$ inch (0.8 cm) space between them.
The top margin should begin 1 inch (2.54 cm) from the top edge of the page.
The bottom margin should be $1\frac{1}{8}$ inches (2.86 cm) from the bottom edge of the page for $8.5 \times 11$-inch paper;
for A4 paper, approximately $1\frac{5}{8}$ inches (4.13 cm) from the bottom edge of the page.
Please number any displayed equations.
It is important for readers to be able to refer to any particular equation.
Wherever Times is specified, Times Roman may also be used.
Main text should be in 10-point Times, single-spaced.
Section headings should be in 10 or 12 point Times.
All paragraphs should be indented 1 pica (approx.~$\frac{1}{6}$ inch or 0.422 cm).
Figure and table captions should be 9-point Roman type as in \cref{fig:onecol}.
List and number all bibliographical references in 9-point Times, single-spaced,
at the end of your response.
When referenced in the text, enclose the citation number in square brackets, for example~\cite{Alpher05}.
Where appropriate, include the name(s) of editors of referenced books.
\begin{figure}[t]
\centering
\fbox{\rule{0pt}{0.5in} \rule{0.9\linewidth}{0pt}}
%\includegraphics[width=0.8\linewidth]{egfigure.eps}
\caption{Example of caption. It is set in Roman so that mathematics
(always set in Roman: $B \sin A = A \sin B$) may be included without an
ugly clash.}
\label{fig:onecol}
\end{figure}
To avoid ambiguities, it is best if the numbering for equations, figures, tables, and references in the author response does not overlap with that in the main paper (the reviewer may wonder if you talk about \cref{fig:onecol} in the author response or in the paper).
See \LaTeX\ template for a workaround.
%-------------------------------------------------------------------------
\subsection{Illustrations, graphs, and photographs}
All graphics should be centered.
Please ensure that any point you wish to make is resolvable in a printed copy of the response.
Resize fonts in figures to match the font in the body text, and choose line widths which render effectively in print.
Readers (and reviewers), even of an electronic copy, may choose to print your response in order to read it.
You cannot insist that they do otherwise, and therefore must not assume that they can zoom in to see tiny details on a graphic.
When placing figures in \LaTeX, it is almost always best to use \verb+\includegraphics+, and to specify the figure width as a multiple of the line width as in the example below
{\small\begin{verbatim}
\usepackage{graphicx} ...
\includegraphics[width=0.8\linewidth]
{myfile.pdf}
\end{verbatim}
}
%%%%%%%%% REFERENCES
{
\small
\bibliographystyle{ieeenat_fullname}
\bibliography{main}
}
\end{document}

View File

@@ -1,19 +1,27 @@
% !TeX root = ../main.tex
\begin{abstract}
Transformers, particularly Vision Transformers (ViTs), have achieved state-of-the-art performance in large-scale image classification.
However, they often require large amounts of data and can exhibit biases, such as center or size bias, that limit their robustness and generalizability.
This paper introduces \schemename, a novel data augmentation operation that addresses these challenges by explicitly imposing invariances into the training data, which are otherwise part of the neural network architecture.
% This paper introduces \name, a novel dataset derived from ImageNet that addresses these challenges.
\schemename is constructed by using pretrained foundation models to separate and recombine foreground objects with different backgrounds.
% enabling fine-grained control over image composition during training.
% Missing sentence here of how you use it to generate data in what way and with what purpose wrt to bias
This recombination step enables us to take fine-grained control over object position and size, as well as background selection.
% It thus increases the data diversity and effective number of training samples.
We demonstrate that using \schemename significantly improves the accuracy of ViTs and other architectures by up to 4.5 percentage points (p.p.) on ImageNet, which translates to 7.3 p.p. on downstream tasks.
% Importantly, \schemename enables novel ways of analyzing model behavior and quantifying biases.
Importantly, \schemename not only improves accuracy but also opens new ways to analyze model behavior and quantify biases.
Namely, we introduce metrics for background robustness, foreground focus, center bias, and size bias and show that using \schemename during training substantially reduces these biases.
In summary, \schemename provides a valuable tool for analyzing and mitigating biases, enabling the development of more robust and reliable computer vision models.
% Transformers, particularly Vision Transformers (ViTs), have achieved state-of-the-art performance in large-scale image classification.
% However, they often require large amounts of data and can exhibit biases, such as center or size bias, that limit their robustness and generalizability.
% This paper introduces \schemename, a novel data augmentation operation that addresses these challenges by explicitly imposing invariances into the training data, which are otherwise part of the neural network architecture.
% \schemename is constructed by using pretrained foundation models to separate and recombine foreground objects with different backgrounds.
% This recombination step enables us to take fine-grained control over object position and size, as well as background selection.
% We demonstrate that using \schemename significantly improves the accuracy of ViTs and other architectures by up to 4.5 percentage points (p.p.) on ImageNet, which translates to 7.3 p.p. on downstream tasks.
% Importantly, \schemename not only improves accuracy but also opens new ways to analyze model behavior and quantify biases.
% Namely, we introduce metrics for background robustness, foreground focus, center bias, and size bias and show that using \schemename during training substantially reduces these biases.
% In summary, \schemename provides a valuable tool for analyzing and mitigating biases, enabling the development of more robust and reliable computer vision models.
% Our code and dataset are publicly available at \code{<url>}.
Large-scale image classification datasets exhibit strong compositional biases: objects tend to be centered, appear at characteristic scales, and co-occur with class-specific context.
% Models can exploit these biases to achieve high in-distribution accuracy, yet remain brittle under distribution shifts.
By exploiting such biases, models attain high in-distribution accuracy but remain fragile under distribution shifts.
To address this issue, we introduce \schemename, a controlled composition augmentation scheme that factorizes each training image into a \emph{foreground object} and a \emph{background} and recombines them to explicitly manipulate object position, object scale, and background identity.
\schemename uses off-the-shelf segmentation and inpainting models to (i) extract the foreground and synthesize a neutral background, and (ii) paste the foreground onto diverse neutral backgrounds before applying standard strong augmentation policies.
Compared to conventional augmentations and content-mixing methods, our factorization provides direct control knobs that break foreground-background correlations. % while preserving the label.
Across 10 architectures, \schemename improves ImageNet top-1 accuracy by up to 6 percentage points (p.p.) and yields gains of up to 7.3 p.p. on fine-grained downstream datasets.
Moreover, the same control knobs enable targeted diagnostic tests: we quantify background reliance, foreground focus, center bias, and size bias via controlled background swaps and position/scale sweeps, and show that training with \schemename substantially reduces these shortcut behaviors and significantly increases accuracy on standard distribution-shift benchmarks by up to $19$ p.p.
% Moreover, the same control knobs enable targeted diagnostic tests: we quantify background reliance, foreground focus, center bias, and size bias via controlled background swaps and position/scale sweeps, and show that training with \schemename substantially reduces these shortcut behaviors and significantly increases accuracy on standard distribution-shift benchmarks like ImageNet-A/-C/-R by up to $19$ p.p.
Our code and dataset are publicly available at \code{<url>}.
\keywords{Data Augmentation \and Vision Transformer \and Robustness}
\end{abstract}

View File

@@ -1,6 +1,101 @@
% !TeX root = ../supplementary.tex
\section{Training Setup}
\label{sec:training_setup}
\begin{table*}[h!]
\centering
\caption{Training setup and hyperparameters for our ImageNet training.}
\label{tab:in-setup}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccc}
\toprule
Augmentation Pipeline: & Basic & 3-Augment~\cite{Touvron2022} & RandAugment~\cite{Touvron2021b} \\
\midrule
Image Resolution & \multicolumn{3}{c}{$224 \times 224$} \\
Epochs & \multicolumn{3}{c}{300} \\
Learning Rate & S/B: 1e-3, L: 5e-4 & 3e-3 & S/B: 1e-3, L: 5e-4 \\
Learning Rate Schedule & \multicolumn{3}{c}{cosine decay} \\
Batch Size & 1024 & 2048 & 1024 \\
GPUs & \multicolumn{3}{c}{$4\times$ NVIDIA A100/H100/H200} \\
Warmup Schedule & \multicolumn{3}{c}{linear} \\
Warmup Epochs & \multicolumn{3}{c}{3} \\
Weight Decay & 0.05 & 0.02 & 0.05 \\
Label Smoothing & \multicolumn{3}{c}{0.1} \\
Optimizer & AdamW & Lamb \cite{You2020} & AdamW \\
\midrule
Augmentations & \makecell{RandomResizedCrop \\ Horizontal Flip \\ ColorJitter} & \makecell{Resize \\ RandomCrop \\ Horizontal Flip \\ Grayscale \\ Solarize \\ Gaussian-Blur \\ Color Jitter} & \makecell{RandomResizedCrop \\ Horizontal Flip \\ RandomErase \cite{Zhong2020} \\ RandAugment \cite{Cubuk2020} \\ Color Jitter} \\
\bottomrule
\end{tabular}
}
\end{table*}
\begin{table}[h!]
\centering
\caption{Training setup for finetuning on different downstream datasets. Other settings are the same as in \Cref{tab:in-setup}. For finetuning, we always utilize 3-Augment and the related parameters from the \emph{ViT, Swin, ResNet} column of \Cref{tab:in-setup}.}
\label{tab:downstream-setup}
\begin{tabular}{lcccc}
\toprule
Dataset & Batch Size & Epochs & Learning Rate & Num. GPUs \\
\midrule
Aircraft & 512 & 500 & 3e-4 & 2 \\
Cars & 1024 & 500 & 3e-4 & 4 \\
Flowers & 256 & 500 & 3e-4 & 1 \\
Food & 2048 & 100 & 3e-4 & 4 \\
Pets & 512 & 500 & 3e-4 & 2 \\
\bottomrule
\end{tabular}
\end{table}
On ImageNet, we test three different data augmentation pipelines and hyperparameter settings as shown in \Cref{tab:in-setup}: a basic pipeline, a RandAugment-based pipeline following the DeiT~\cite{Touvron2021b} setup, and 3-Augment, as used in \cite{Touvron2022,Nauen2025}.
When comparing different architectures, ViT, Swin, and ResNet are trained with the 3-Augment pipeline and DeiT is trained with the RandAugment pipeline.
% On ImageNet we use the same training setup as \cite{Nauen2025} and \cite{Touvron2022} without pretraining for ViT, Swin, and ResNet.
% For DeiT, we train the same ViT architecture but using the data augmentation scheme and hyperparameters from \cite{Touvron2021b}.
As our focus is on evaluating the changes in accuracy due to \schemename, like \cite{Nauen2025}, we stick to one set of hyperparameters for all models.
We list the settings used for training on ImageNet in \Cref{tab:in-setup} and the ones used for finetuning those weights on the downstream datasets in \Cref{tab:downstream-setup}.
Our implementation uses PyTorch \cite{Paszke2019} and the \emph{timm} library \cite{Wightman2019} for model architectures and basic functions.
\begin{table*}[ht!]
\centering
\caption{Hardware and Software specifics used for both training and evaluation.}
\label{tab:hw-sw-versions}
\begin{tabular}{ll}
\toprule
Parameter & Value \\
\midrule
GPU & $4 \times$ NVIDIA A100/H100/H200 \\
CPU & 24 CPU cores (Intel Xeon) per GPU \\
Memory & up to 120 GB per GPU \\
Operating System & Enroot container for SLURM based on Ubuntu 24.04 LTS \\
Python & 3.12.3 \\
PyTorch & 2.7.0 \\
TorchVision & 0.22.0 \\
Timm & 1.0.15 \\
\bottomrule
\end{tabular}
\end{table*}
\Cref{tab:hw-sw-versions} lists the specific hardware we use, as well as versions of the relevant software packages.
\section{Resource Usage of \schemename}
To utilize the proposed \schemename, specific computational resources are necessary, particularly for computing and storing the output of the segmentation stage and for on-the-fly processing of the recombination stage.
\paragraph{Segmentation.}
% While calculating the segmentations and infills takes a lot of compute, this is effort that has to be spent only once per dataset.
\schemename involves a computationally expensive segmentation and infill stage, which is a one-time calculation per dataset.
Once computed, the segmentation and infill results can be perpetually reused, amortizing the initial cost over all subsequent experiments and applications.
On NVIDIA H100 GPUs, the segmentation stage will compute at a rate of $374.3 \frac{\text{img}}{\text{GPU} \times \text{h}}$ when using Attentive Eraser or $5338.6 \frac{\text{img}}{\text{GPU} \times \text{h}}$ for LaMa.
For ImageNet this comes down to just under 9 days (Attentive Eraser) or 16 hours (LaMa) on two 8 GPU nodes.
To facilitate immediate use and reproduction of results, we publicly provide the precalculated segmentation stage output for the ImageNet dataset for download\footnote{Link will go here.}.
The output of \schemename's segmentation step on ImageNet dataset requires 73 GB of additional disk space for the segmentation output, which is separate from the base 147 GB ImageNet size.
\paragraph{Recombination.}
The recombination step of \schemename is implemented as a data loader operation.
It is therefore offloaded to the CPU, where it can be heavily parallelized, and thus only results in a very minor increase in the training step-time.
For example, using a ViT-B model on an NVIDIA A100 GPU, the average update step-time increased by $1\%$, from $528 \pm 2$ ms to $534 \pm 1$ ms.
\section{Extended Bates Distribution}
\label{apdx:bates-distribution}
\begin{figure}[h!]
\centering
\includegraphics[width=.5\columnwidth]{img/bates.pdf}
@@ -8,27 +103,6 @@
\label{fig:bates-pdf}
\end{figure}
% Finally, we analyze the foreground object's positioning in the image.
% We utilize an extended Bates distribution to sample the position of the foreground object.
% The Bates distribution~\cite{Bates1955} with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
% Therefore, the larger $\eta$, the more concentrated the distribution is around the center.
% We extend this concept to $\eta \leq -1$ by shifting the distribution away from the center and towards the edges.
% We extend this concept to $\eta \leq -1$ by defining
% \begin{align*}
% X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta)
% \end{align*}
% for $\eta \leq 1$ with $s$ being the sawtooth function on $[0, 1]$:
% \begin{align}
% s(x) = \begin{cases}
% x + 0.5 & \text{if } 0 < x < 0.5 \\
% x - 0.5 & \text{if } 0.5 \leq x \leq 1
% \end{cases}
% \end{align}
% Note that $s \circ s = \id$ on $[0, 1]$.
% This way, distributions with $\eta \leq -1$ are more concentrated around the borders.
% $\eta = 1$ and $\eta = -1$ both correspond to the uniform distribution.
% The PDF of this extended Bates distribution is visualized in \Cref{fig:bates-pdf}.
We introduce an extension of the Bates distribution~\cite{Bates1955} to include negative parameters, enabling sampling of foreground object positions away from the image center.
The standard Bates distribution, for $\eta \in \N$, is defined as the mean of $\eta$ independent random variables drawn from a uniform distribution \cite{Jonhson1995}.
A larger $\eta$ value increases the concentration of samples around the distribution's mean, which in this case is the image center.
@@ -51,98 +125,304 @@ This transformation inverts the distribution's concentration, shifting the proba
We visualize the distribution function of the extended Bates distribution in \Cref{fig:bates-pdf}.
Both $\eta = 1$ and $\eta = -1$ result in a uniform distribution across the image.
\section{Resource Usage of \schemename}
To utilize the proposed \schemename, specific computational resources are necessary, particularly for computing and storing the output of the segmentation stage and for on-the-fly processing of the recombination stage.
\section{Design Choices of \schemename}
\label{sec:ablation}
\paragraph{Segmentation.}
% While calculating the segmentations and infills takes a lot of compute, this is effort that has to be spent only once per dataset.
\schemename involves a computationally expensive segmentation and infill stage, which is a one-time calculation per dataset.
Once computed, the segmentation and infill results can be perpetually reused, amortizing the initial cost over all subsequent experiments and applications.
On NVIDIA H100 GPUs, the segmentation stage will compute at a rate of $374.3 \frac{\text{img}}{\text{GPU} \times \text{h}}$ when using Attentive Eraser or $5338.6 \frac{\text{img}}{\text{GPU} \times \text{h}}$ for LaMa.
For ImageNet this comes down to just under 9 days (Attentive Eraser) or 16 hours (LaMa) on two 8 GPU nodes.
To facilitate immediate use and reproduction of results, we publicly provide the precalculated segmentation stage output for the ImageNet dataset for download\footnote{Link will go here.}.
The output of \schemename's segmentation step on ImageNet dataset requires 73 GB of additional disk space for the segmentation output, which is separate from the base 147 GB ImageNet size.
We start by ablating the design choices of \schemename on TinyImageNet~\cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each. %, and Tiny\name, the application of \schemename to TinyImageNet.
% \Cref{tab:ablation} presents the results of these ablations.
\Cref{tab:ablation-segment} presents ablations for segmentation and \Cref{tab:ablation-recombine} for recombination.
\paragraph{Recombination.}
The recombination step of \schemename is implemented as a data loader operation.
It is therefore offloaded to the CPU, where it can be heavily parallelized, and thus only results in a very minor increase in the training step-time.
For example, using a ViT-B model on an NVIDIA A100 GPU, the average update step-time increased by $1\%$, from $528 \pm 2$ ms to $534 \pm 1$ ms.
\section{Training Setup}
\label{sec:training_setup}
\begin{table*}[h!]
\begin{table}
\caption{Ablation of the design decisions in the segmentation phase of \schemename on TinyImageNet.
The first line is our baseline, while the other lines are using \schemename.
We use basic settings with the \emph{same} background strategy during recombination for this experiment.
}
\label{tab:ablation-segment}
\centering
\caption{Training setup and hyperparameters for our ImageNet training.}
\label{tab:in-setup}
\begin{tabular}{lcc}
\small
% \resizebox{.9\columnwidth}{!}{
\begin{tabular}{llcc}
\toprule
Parameter & ViT, Swin, ResNet & DeiT \\
\multirow{2.5}{*}{\makecell{Detect. \\Prompt}} & \multirow{2.5}{*}{\makecell{Infill \\ Model}} & \multicolumn{2}{c}{TinyImageNet Accuracy [\%]} \\
\cmidrule{3-4}
& & ViT-Ti & ViT-S \\
\midrule
Image Resolution & $224 \times 224$ & $224 \times 224$ \\
Epochs & 300 & 300 \\
Learning Rate & 3e-3 & S/B: 1e-3, L: 5e-4 \\
Learning Rate Schedule & cosine decay & cosine decay \\
Batch Size & 2048 & 1024 \\
GPUs & $4\times$ NVIDIA A100/H100/H200 & $4\times$ NVIDIA A100/H100/H200 \\
Warmup Schedule & linear & linear \\
Warmup Epochs & 3 & 3 \\
Weight Decay & 0.02 & 0.05 \\
Label Smoothing & 0.1 & 0.1 \\
Optimizer & Lamb \cite{You2020} & AdamW \\
\cmidrule(r){1-1}
Data Augmentation Policy & \textbf{3-Augment \cite{Touvron2022}} & \textbf{DeiT \cite{Touvron2021b}} \\
Augmentations & \makecell{Resize \\ RandomCrop \\ HorizontalFlip \\ Grayscale \\ Solarize \\ GaussianBlur \\ ColorJitter \\ CutMix \cite{Yun2019}} & \makecell{RandomResizedCrop \\ HorizontalFlip \\ RandomErase \cite{Zhong2017} \\ RandAugment \cite{Cubuk2019} \\ ColorJitter \\ Mixup \cite{Zhang2018a} \\ CutMix \cite{Yun2019}} \\
\bottomrule
\end{tabular}
\end{table*}
\begin{table}[h!]
\centering
\caption{Training setup for finetuning on different downstream datasets. Other settings are the same as in \Cref{tab:in-setup}. For finetuning, we always utilize 3-Augment and the related parameters from the \emph{ViT, Swin, ResNet} column of \Cref{tab:in-setup}.}
\label{tab:downstream-setup}
\begin{tabular}{lcccc}
\toprule
Dataset & Batch Size & Epochs & Learning Rate & Num. GPUs \\
\midrule
Aircraft & 512 & 500 & 3e-4 & 2 \\
Cars & 1024 & 500 & 3e-4 & 4 \\
Flowers & 256 & 500 & 3e-4 & 1 \\
Food & 2048 & 100 & 3e-4 & 4 \\
Pets & 512 & 500 & 3e-4 & 2 \\
\multicolumn{2}{l}{\textbf{TinyImageNet}} & $66.1 \pm 0.5$ & $68.3 \pm 0.7$ \\
specific & LaMa \cite{Suvorov2022} & $65.5 \pm 0.4$ & $71.2 \pm 0.5$ \\
general & \gtxt{LaMa \cite{Suvorov2022}} & $66.4 \pm 0.6$ & $72.9 \pm 0.6$ \\
\gtxt{general} & Att. Eraser \cite{Sun2025} & $67.5 \pm 1.2$ & $72.4 \pm 0.5$ \\
\bottomrule
\end{tabular}
% }
\end{table}
On ImageNet we use the same training setup as \cite{Nauen2025} and \cite{Touvron2022} without pretraining for ViT, Swin, and ResNet.
For DeiT, we train the same ViT architecture but using the data augmentation scheme and hyperparameters from \cite{Touvron2021b}.
As our focus is on evaluating the changes in accuracy due to \schemename, like \cite{Nauen2025}, we stick to one set of hyperparameters for all models.
We list the settings used for training on ImageNet in \Cref{tab:in-setup} and the ones used for finetuning those weights on the downstream datasets in \Cref{tab:downstream-setup}.
Our implementation uses PyTorch \cite{Paszke2019} and the \emph{timm} library \cite{Wightman2019} for model architectures and basic functions.
\begin{table*}[h!]
\begin{table}[t]
\caption{Ablation of the recombination phase of \schemename on TinyImageNet (top) and ImageNet (bottom). The first experiments use the initial segmentation settings with LaMa \cite{Suvorov2022}.}
\label{tab:ablation-recombine}
\centering
\caption{Hardware and Software specifics used for both training and evaluation.}
\label{tab:hw-sw-versions}
\begin{tabular}{ll}
% \resizebox{.9\columnwidth}{!}{
\begin{tabular}{ccccccccccc}
\toprule
Parameter & Value \\
% FG. & Augment. & BG. & BG. & Edge & Original & \multicolumn{2}{c}{Accuracy [\%]} \\
% Size & Order & Strat. & Prune & Smoothing & Mixing & ViT-Ti & ViT-S \\
\multirow{2.5}{*}{\makecell{FG. \\size}} & \multirow{2.5}{*}{\makecell{Augment.\\Order}} & \multirow{2.5}{*}{\makecell{BG\\Strat.}} & \multirow{2.5}{*}{\makecell{BG.\\Prune}} & \multirow{2.5}{*}{\makecell{Original\\Mixing}} & \multirow{2.5}{*}{\makecell{Edge\\Smooth.}} & \multicolumn{2}{c}{Accuracy [\%]} \\
\cmidrule{7-8}
& & & & & & ViT-Ti & ViT-S \\
\midrule
GPU & NVIDIA A100/H100/H200 \\
CPU & 24 CPU cores (Intel Xeon) per GPU \\
Memory & up to 120 GB per GPU \\
Operating System & Enroot container for SLURM based on Ubuntu 24.04 LTS \\
Python & 3.12.3 \\
PyTorch & 2.7.0 \\
TorchVision & 0.22.0 \\
Timm & 1.0.15 \\
% TinyImageNet & & & & & & & $66.1\pm0.5$ & $68.3\pm0.7$ \\
\multicolumn{6}{l}{\textbf{TinyImageNet}} & \gtxt{$66.1\pm0.5$} & \gtxt{$68.3\pm0.7$} \\
mean & crop$\to$paste & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\
range & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\
\midrule
% \gtxt{range} & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $66.4\pm0.6$ & $72.9\pm0.6$ \\
{range} & {crop$\to$paste} & {same} & {-} & {-} & {-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\
\gtxt{range} & paste$\to$crop & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 2.0$ & \gtxt{-} & $67.2\pm0.4$ & $72.9\pm0.5$ \\
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{-} & $65.9\pm0.5$ & $72.4\pm0.6$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.2$ & \gtxt{-} & $69.8\pm0.5$ & $75.0\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.33$ & \gtxt{-} & $69.5\pm0.4$ & $75.2\pm1.0$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.5$ & \gtxt{-} & $70.3\pm1.0$ & $74.2\pm0.2$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & linear & \gtxt{-} & $70.1\pm0.7$ & $74.9\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & reverse lin. & \gtxt{-} & $67.6\pm0.2$ & $73.2\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & cos & \gtxt{-} & $71.3\pm1.0$ & $75.7\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & $70.0\pm0.8$ & $75.5\pm0.7$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & orig. & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $67.2\pm0.9$ & $69.9\pm1.0$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $70.1\pm0.7$ & $77.5\pm0.6$ \\
\midrule
\multicolumn{6}{l}{\textbf{ImageNet}} & \gtxt{-} & \gtxt{$79.1\pm0.1$} \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & \gtxt{-} & - & $80.5\pm0.1$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & - & $80.7\pm0.1$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & - & $81.4\pm0.1$ \\
\bottomrule
\end{tabular}
\end{table*}
\Cref{tab:hw-sw-versions} lists the specific hardware we use, as well as versions of the relevant software packages.
% }
\end{table}
\textbf{Prompt.}
% We present the ablation of our main design decisions in \Cref{tab:ablation}.
First, we evaluate the type of prompt used to detect the foreground object.
Here, the \emph{general} prompt, which contains the class and the more general object category, outperforms only having the class name (\emph{specific}).
\textbf{Inpainting.} Among inpainting models, Attentive Eraser~\cite{Sun2025} produces slightly better results compared to LaMa~\cite{Suvorov2022} ($+0.5$ p.p. on average).
For inpainting examples, see the supplementary material.
% (see the supplementary material for examples).
% When comparing the infill models, the GAN-based LaMa \cite{Suvorov2022} gets outperformed by the Attentive Eraser \cite{Sun2025}.
\textbf{Foreground size}
% We observe that LaMa's often infills unnatural textures compared to Attentive Eraser.
% The size of foreground objects during training has a significant impact on the performance.
% Here, using the greater variability of the \emph{range} strategy increases the performance by $\approx 1\%$ compared to the \emph{mean} strategy.
significantly impacts performance.
Employing a \emph{range} of sizes during recombination, rather than a fixed \emph{mean} size, boosts accuracy by approximately 1 p.p.
This suggests that the added variability is beneficial.
\textbf{Order of data augmentation.}
% (1) Applying the image crop related augmentations \emph{before} pasting the foreground object and the color-based ones \emph{after} pasting or (2) applying all data augmentations after pasting the foreground object.
% While results are ambiguous, we choose the second strategy, as it improves the performance of ViT-S, although not the one of ViT-Ti.
Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}).
ViT-Ti results are ambiguous.
\textbf{Background pruning.}
When it comes to the backgrounds to use, we test different pruning thresholds ($t_\text{prune}$) to exclude backgrounds with large inpainting.
% and only use backgrounds with an relative size of the infilled region of at most $t_\text{prune}$ (exclusive).
A threshold of $t_\text{prune}=1.0$ means that we use all backgrounds that are not fully infilled.
% We find that the background pruning does not significantly impact the models' performance.
% We choose $t_\text{prune}=0.8$ for the following experiments to exclude backgrounds that are mostly artificial.
Varying $t_\text{prune}$ has minimal impact.
We choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds.
% One of the most important design decisions is the mixing of the original dataset with \name.
\textbf{Mixing} \schemename-augmented samples with the original ImageNet data proves crucial.
While constant and linear mixing schedules improve performance by $2$--$3$ p.p.\ over using only augmented samples, the cosine annealing schedule proves optimal, boosting accuracy by $3$--$4$ p.p.
\textbf{Edge smoothing.}
We evaluate the impact of using Gaussian blurring to smooth the edges of the foreground masks.
% Similarly, applying edge smoothing to foreground masks with Gaussian blurring actually hurts performance on Tiny\name, but slightly improves it on \name.
For larger models, this gives us a slight performance boost on the full ImageNet (second to last line in \Cref{tab:ablation-recombine}).
\textbf{Background strategy.}
Another point is the allowed choice of background image for each foreground object.
% We evaluate three different strategies.
% (1) Picking the background from which that specific foreground was originally extracted.
% The major difference to ImageNet when using this setup is the variability in size and position of the foreground object.
% (2) Picking a background that originally had a foreground object of the same class in it.
% Here, we have backgrounds where objects of this type can typically appear while also creating a wider variety of samples due to pairing each foreground object with different backgrounds each time.
% (3) Picking any background.
% This choice has the largest variety of backgrounds, but the backgrounds are not semantically related to the foreground object anymore.
% We find in \Cref{fig:bg-strategy} that choosing only a foreground's original background is the worst choice.
We compare using the original background, a background from the same class, and any background.
These strategies go from low diversity and high shared information content between the foreground and background to high diversity and low shared information content.
For \emph{ViT-Ti}, the latter two strategies perform comparably, while \emph{ViT-S} benefits from the added diversity of using any background.
The same is true when training on the full ImageNet.
\begin{table}
\caption{Accuracy of ViT-S on TinyImageNet (TIN) in percent using \schemename with different foreground position distributions by varying the Bates parameter $\eta$.
The best performance is achieved when using the uniform distribution ($\eta=1$) for training.}
\label{tbl:foreground-eta}
\centering
\small
% \resizebox{.9\columnwidth}{!}{
\begin{tabular}{ccccccc}
\toprule
\multirow{2.5}{*}{\makecell{Bates Parameter \\during training}} & \multirow{2.5}{*}{\makecell{TIN \\w/o \schemename}} & \multicolumn{5}{c}{TIN w/ \schemename} \\
\cmidrule(l){3-7}
& & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\
\midrule
Baseline & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 \\
$\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\
$\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\
$\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\
$\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\
$\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\
\bottomrule
\end{tabular}
% }
\end{table}
\textbf{Foreground position.}
Finally, we analyze the foreground object's positioning in the image, using a
generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \Z$ (see \Cref{apdx:bates-distribution}).
The Bates distribution presents an easy way to sample from a bounded domain with just one hyperparameter that controls its concentration.
$\eta = 1/-1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders (see supplementary material for details).
% We utilize an extended Bates distribution to sample the position of the foreground object.
% The Bates distribution with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
% The larger $\eta$, the more concentrated the distribution is at the center, $\eta < -1$ concentrates the distribution at the edges.
% We extend this concept to $\eta \leq -1$, shifting the distribution away from the center and towards the edges.
When sampling more towards the center of the image, the difficulty of the task is reduced, which reduces performance on TinyImageNet (\Cref{tbl:foreground-eta}).
This is reflected in the performance when evaluating using \schemename with $\eta=2$ and $\eta=3$ compared to $\eta=1/-1$.
We observe a similar reduction for $\eta < -1$.
% This experiment is conducted using the LaMa infill model.
\begin{table}[t]
\caption{Dataset statistics for TinyImageNet and ImageNet with and without \schemename. For \schemename we report the number of foreground/background pairs.}
\label{tab:dataset-stats}
\centering
% \resizebox{.5\columnwidth}{!}{
\begin{tabular}{l S[table-format=4.0] S[table-format=7.0] S[table-format=5.0]}
\toprule
Dataset & {Classes} & {\makecell{Training \\ Images}} & {\makecell{Validation \\ Images}} \\
\midrule
TinyImageNet & 200 & 100000 & 10000 \\
TinyImageNet + \schemename & 200 & 99404 & 9915 \\
ImageNet & 1000 & 1281167 & 50000 \\
ImageNet + \schemename & 1000 & 1274557 & 49751 \\
\bottomrule
\end{tabular}
% }
\end{table}
After fixing the optimal design parameters in \Cref{tab:ablation-segment,tab:ablation-recombine} (last rows), we run \schemename's segmentation step on the entire ImageNet dataset.
\Cref{tab:dataset-stats} shows the resulting dataset statistics.
% The slightly lower number of images in \name is due to \emph{Grounded SAM} returning no or invalid detections for some images.
The slightly reduced image count for \schemename is due to instances where Grounded SAM fails to produce valid segmentation masks.
\section{Robustness Evaluation on Corner-Cases}
\begin{table}[t]
\centering
\caption{Evaluation on the Corner-Cases dataset. Objects cut from ImageNet evaluation bounding boxes are pasted onto infilled backgrounds. Objects have three sizes: $56$px, $84$px, and $112$px. Objects are placed in the center (CeX) or corner (CoX) of an image on its original background (XxO) or a random background (XxR).}
\label{tab:corner-cases}
\resizebox{\textwidth}{!}{
\begin{tabular}{lcccccccccccccc}
\toprule
\multirow{4}{*}{Model} & \multirow{4}{*}{w/ \schemename} & \multicolumn{12}{c}{Corner Cases Accuracy [\%]} \\
\cmidrule(l){3-14}
& & \multicolumn{4}{c}{56} & \multicolumn{4}{c}{84} & \multicolumn{4}{c}{112} \\
\cmidrule(lr){3-6} \cmidrule(lr){7-10} \cmidrule(l){11-14}
& & CeO & CoO & CeR & CoR & CeO & CoO & CeR & CoR & CeO & CoO & CeR & CoR \\
\midrule
ViT-S & \xmark & $40.5 \pm 2.0$ & $28.6 \pm 0.8$ & $10.3 \pm 0.9$ & $6.4 \pm 0.2$ & $56.8 \pm 1.2$ & $47.6 \pm 1.0$ & $31.3 \pm 0.7$ & $25.5 \pm 0.5$ & $70.9 \pm 0.1$ & $66.9 \pm 1.6$ & $55.2 \pm 0.2$ & $51.1 \pm 0.8$ \\
ViT-S & \cmark & $49.4 \pm 0.6$ & $39.9 \pm 0.5$ & $22.7 \pm 0.4$ & $17.6 \pm 0.3$ & $66.3 \pm 0.3$ & $60.0 \pm 0.3$ & $47.7 \pm 0.7$ & $43.2 \pm 0.2$ & $76.5 \pm 0.2$ & $74.9 \pm 0.4$ & $66.8 \pm 0.6$ & $64.9 \pm 0.1$ \\
& & \grntxt{$+8.9$} & \grntxt{$+11.3$} & \grntxt{$+12.4$} & \grntxt{$+11.2$} & \grntxt{$+9.4$} & \grntxt{$+12.4$} & \grntxt{$+16.4$} & \grntxt{$+17.7$} & \grntxt{$+5.6$} & \grntxt{$+8.0$} & \grntxt{$+11.6$} & \grntxt{$+13.7$} \\
\cmidrule(r){1-2}
ViT-B & \xmark & $37.9 \pm 1.4$ & $29.3 \pm 0.7$ & $14.0 \pm 1.7$ & $11.9 \pm 1.1$ & $51.5 \pm 0.7$ & $45.0 \pm 0.8$ & $27.3 \pm 0.8$ & $26.3 \pm 0.8$ & $64.7 \pm 0.3$ & $61.8 \pm 0.6$ & $46.3 \pm 0.3$ & $45.5 \pm 0.5$ \\
ViT-B & \cmark & $50.4 \pm 0.8$ & $42.4 \pm 0.6$ & $26.5 \pm 0.6$ & $22.8 \pm 0.8$ & $65.3 \pm 0.9$ & $60.9 \pm 0.6$ & $47.6 \pm 0.3$ & $45.6 \pm 0.1$ & $75.7 \pm 0.6$ & $74.0 \pm 0.6$ & $65.7 \pm 0.7$ & $64.3 \pm 0.5$ \\
& & \grntxt{$+12.5$} & \grntxt{$+13.1$} & \grntxt{$+12.4$} & \grntxt{$+10.9$} & \grntxt{$+13.8$} & \grntxt{$+15.9$} & \grntxt{$+20.2$} & \grntxt{$+19.3$} & \grntxt{$+11.0$} & \grntxt{$+12.2$} & \grntxt{$+19.3$} & \grntxt{$+18.8$} \\
\cmidrule(r){1-2}
ViT-L & \xmark & $32.8 \pm 1.6$ & $24.8 \pm 1.1$ & $14.8 \pm 2.2$ & $9.7 \pm 1.2$ & $42.7 \pm 0.9$ & $33.8 \pm 0.7$ & $21.3 \pm 1.5$ & $16.3 \pm 1.0$ & $55.7 \pm 0.7$ & $49.7 \pm 0.7$ & $36.0 \pm 1.3$ & $32.5 \pm 0.9$ \\
ViT-L & \cmark & $45.7 \pm 0.6$ & $39.0 \pm 0.5$ & $25.6 \pm 0.6$ & $24.1 \pm 0.8$ & $59.1 \pm 0.3$ & $55.2 \pm 0.4$ & $41.9 \pm 1.0$ & $42.7 \pm 0.6$ & $71.4 \pm 0.3$ & $69.0 \pm 0.4$ & $60.7 \pm 1.0$ & $60.3 \pm 0.8$ \\
& & \grntxt{$+12.9$} & \grntxt{$+14.2$} & \grntxt{$+10.8$} & \grntxt{$+14.4$} & \grntxt{$+16.3$} & \grntxt{$+21.5$} & \grntxt{$+20.5$} & \grntxt{$+26.4$} & \grntxt{$+15.7$} & \grntxt{$+19.3$} & \grntxt{$+24.7$} & \grntxt{$+27.8$} \\
\cmidrule(r){1-2}
DeiT-S & \xmark & $46.3 \pm 0.7$ & $38.1 \pm 0.3$ & $13.1 \pm 0.5$ & $9.9 \pm 0.1$ & $62.8 \pm 0.4$ & $58.2 \pm 0.2$ & $37.1 \pm 0.7$ & $34.3 \pm 0.5$ & $73.3 \pm 0.2$ & $73.9 \pm 0.4$ & $58.8 \pm 0.4$ & $59.4 \pm 0.6$ \\
DeiT-S & \cmark & $44.7 \pm 1.4$ & $37.1 \pm 1.4$ & $15.6 \pm 1.3$ & $12.1 \pm 0.9$ & $62.1 \pm 1.2$ & $57.8 \pm 1.1$ & $41.6 \pm 1.1$ & $37.9 \pm 1.2$ & $73.2 \pm 0.7$ & $73.3 \pm 0.4$ & $62.3 \pm 0.7$ & $61.4 \pm 0.9$ \\
& & \rdtxt{$-1.6$} & \rdtxt{$-1.1$} & \grntxt{$+2.4$} & \grntxt{$+2.2$} & \rdtxt{$-0.7$} & \rdtxt{$-0.4$} & \grntxt{$+4.4$} & \grntxt{$+3.5$} & \gtxt{$-0.1$} & \rdtxt{$-0.6$} & \grntxt{$+3.5$} & \grntxt{$+2.0$} \\
\cmidrule(r){1-2}
DeiT-B & \xmark & $48.1 \pm 0.9$ & $40.4 \pm 2.0$ & $15.8 \pm 0.2$ & $12.9 \pm 0.6$ & $64.0 \pm 0.9$ & $59.5 \pm 1.3$ & $39.0 \pm 0.9$ & $37.2 \pm 0.8$ & $74.1 \pm 0.7$ & $74.8 \pm 0.7$ & $59.1 \pm 0.8$ & $60.0 \pm 0.6$ \\
DeiT-B & \cmark & $50.7 \pm 0.1$ & $44.0 \pm 0.4$ & $19.3 \pm 0.2$ & $16.3 \pm 0.2$ & $66.0 \pm 0.2$ & $62.0 \pm 0.3$ & $43.4 \pm 0.3$ & $40.9 \pm 0.4$ & $75.4 \pm 0.1$ & $76.4 \pm 0.3$ & $62.8 \pm 0.2$ & $63.9 \pm 0.2$ \\
& & \grntxt{$+2.6$} & \grntxt{$+3.6$} & \grntxt{$+3.5$} & \grntxt{$+3.5$} & \grntxt{$+2.0$} & \grntxt{$+2.5$} & \grntxt{$+4.4$} & \grntxt{$+3.8$} & \grntxt{$+1.3$} & \grntxt{$+1.6$} & \grntxt{$+3.8$} & \grntxt{$+3.9$} \\
\cmidrule(r){1-2}
DeiT-L & \xmark & $39.2 \pm 2.6$ & $32.6 \pm 1.5$ & $10.5 \pm 2.8$ & $9.1 \pm 2.3$ & $55.7 \pm 2.5$ & $51.0 \pm 2.7$ & $30.3 \pm 4.0$ & $29.5 \pm 3.9$ & $68.5 \pm 2.1$ & $68.1 \pm 1.7$ & $51.7 \pm 3.1$ & $52.1 \pm 2.7$ \\
DeiT-L & \cmark & $51.9 \pm 0.7$ & $46.6 \pm 0.5$ & $21.5 \pm 1.3$ & $19.0 \pm 1.2$ & $66.6 \pm 0.6$ & $64.1 \pm 0.7$ & $45.3 \pm 1.3$ & $43.6 \pm 1.1$ & $75.6 \pm 0.4$ & $77.3 \pm 0.4$ & $63.8 \pm 0.8$ & $65.4 \pm 0.6$ \\
& & \grntxt{$+12.8$} & \grntxt{$+14.0$} & \grntxt{$+11.0$} & \grntxt{$+9.9$} & \grntxt{$+11.0$} & \grntxt{$+13.1$} & \grntxt{$+15.0$} & \grntxt{$+14.1$} & \grntxt{$+7.1$} & \grntxt{$+9.2$} & \grntxt{$+12.1$} & \grntxt{$+13.4$} \\
\cmidrule(r){1-2}
Swin-Ti & \xmark & $41.2 \pm 1.8$ & $32.5 \pm 0.3$ & $17.4 \pm 2.6$ & $12.2 \pm 0.2$ & $60.0 \pm 1.6$ & $51.4 \pm 0.2$ & $39.6 \pm 2.6$ & $34.8 \pm 0.9$ & $71.7 \pm 0.8$ & $66.1 \pm 0.7$ & $58.2 \pm 1.1$ & $53.6 \pm 1.2$ \\
Swin-Ti & \cmark & $49.8 \pm 0.6$ & $42.8 \pm 0.7$ & $24.2 \pm 0.7$ & $21.4 \pm 0.9$ & $66.4 \pm 0.6$ & $60.5 \pm 0.2$ & $47.8 \pm 0.5$ & $44.6 \pm 0.5$ & $76.0 \pm 0.3$ & $72.7 \pm 0.2$ & $65.7 \pm 0.5$ & $62.1 \pm 0.3$ \\
& & \grntxt{$+8.5$} & \grntxt{$+10.3$} & \grntxt{$+6.8$} & \grntxt{$+9.2$} & \grntxt{$+6.4$} & \grntxt{$+9.2$} & \grntxt{$+8.2$} & \grntxt{$+9.8$} & \grntxt{$+4.3$} & \grntxt{$+6.5$} & \grntxt{$+7.5$} & \grntxt{$+8.5$} \\
\cmidrule(r){1-2}
Swin-S & \xmark & $41.3 \pm 0.6$ & $33.0 \pm 0.1$ & $18.4 \pm 0.7$ & $13.3 \pm 0.5$ & $59.2 \pm 0.1$ & $51.2 \pm 0.5$ & $39.1 \pm 0.2$ & $35.9 \pm 0.3$ & $71.5 \pm 0.2$ & $65.6 \pm 0.1$ & $56.8 \pm 0.5$ & $53.2 \pm 0.2$ \\
Swin-S & \cmark & $48.6 \pm 0.7$ & $39.9 \pm 1.6$ & $22.2 \pm 0.9$ & $16.8 \pm 1.1$ & $64.4 \pm 0.9$ & $57.9 \pm 1.5$ & $43.8 \pm 1.1$ & $42.3 \pm 1.0$ & $75.7 \pm 0.2$ & $71.8 \pm 0.8$ & $63.2 \pm 0.4$ & $60.6 \pm 0.6$ \\
& & \grntxt{$+7.3$} & \grntxt{$+7.0$} & \grntxt{$+3.8$} & \grntxt{$+3.6$} & \grntxt{$+5.1$} & \grntxt{$+6.7$} & \grntxt{$+4.7$} & \grntxt{$+6.4$} & \grntxt{$+4.2$} & \grntxt{$+6.2$} & \grntxt{$+6.4$} & \grntxt{$+7.4$} \\
\cmidrule(r){1-2}
ResNet50 & \xmark & $48.6 \pm 0.6$ & $35.1 \pm 0.4$ & $23.0 \pm 0.7$ & $13.0 \pm 0.3$ & $65.8 \pm 0.4$ & $58.2 \pm 0.3$ & $44.4 \pm 0.6$ & $38.1 \pm 0.5$ & $73.2 \pm 0.2$ & $69.9 \pm 0.2$ & $56.9 \pm 0.1$ & $56.9 \pm 0.1$ \\
ResNet50 & \cmark & $52.3 \pm 0.6$ & $39.5 \pm 0.1$ & $27.4 \pm 0.6$ & $17.6 \pm 0.1$ & $68.5 \pm 0.3$ & $61.9 \pm 0.1$ & $48.5 \pm 0.4$ & $43.7 \pm 0.3$ & $75.2 \pm 0.1$ & $72.4 \pm 0.1$ & $61.7 \pm 0.3$ & $61.7 \pm 0.3$ \\
& & \grntxt{$+3.7$} & \grntxt{$+4.4$} & \grntxt{$+4.4$} & \grntxt{$+4.6$} & \grntxt{$+2.8$} & \grntxt{$+3.8$} & \grntxt{$+4.2$} & \grntxt{$+5.5$} & \grntxt{$+2.0$} & \grntxt{$+2.5$} & \grntxt{$+4.8$} & \grntxt{$+4.8$} \\
\cmidrule(r){1-2}
ResNet101 & \xmark & $47.8 \pm 0.7$ & $37.2 \pm 0.5$ & $20.4 \pm 1.2$ & $14.2 \pm 0.3$ & $64.9 \pm 0.2$ & $58.6 \pm 0.5$ & $41.1 \pm 0.5$ & $38.3 \pm 0.7$ & $73.6 \pm 0.3$ & $70.5 \pm 0.3$ & $56.2 \pm 0.4$ & $57.0 \pm 0.5$ \\
ResNet101 & \cmark & $52.3 \pm 0.1$ & $42.2 \pm 0.1$ & $24.7 \pm 0.1$ & $19.2 \pm 0.4$ & $68.8 \pm 0.6$ & $62.9 \pm 0.3$ & $46.4 \pm 1.5$ & $44.3 \pm 0.9$ & $76.0 \pm 0.4$ & $73.7 \pm 0.3$ & $61.0 \pm 1.2$ & $62.6 \pm 0.5$ \\
& & \grntxt{$+4.4$} & \grntxt{$+5.0$} & \grntxt{$+4.3$} & \grntxt{$+5.0$} & \grntxt{$+3.9$} & \grntxt{$+4.3$} & \grntxt{$+5.3$} & \grntxt{$+6.0$} & \grntxt{$+2.4$} & \grntxt{$+3.2$} & \grntxt{$+4.7$} & \grntxt{$+5.7$} \\
\bottomrule
\end{tabular}
}
\end{table}
\Cref{tab:corner-cases} reports accuracy on the corner-cases dataset~\cite{Fatima2025} for models trained with and without \schemename.
The dataset is constructed by pasting objects cropped by their full bounding boxes (which are available for the ImageNet validation set) onto 224$\times$224 infilled backgrounds.
The dataset has three factors: foreground size (56, 84, 112 pixels), spatial position (center, CeX, vs.\ corner, CoX), and background type (original image background, XxO, vs.\ a random background, XxR), yielding $3 \times 2 \times 2$ controlled configurations per model.
Across all architectures, training with \schemename consistently improves robustness to these composition shifts.
For ViT-S/B/L, gains range from roughly $+8$ to over $+27$ percentage points, with the largest improvements occurring in the most challenging settings with foregrounds placed in corners on random backgrounds (e.g., CoR and CeR).
Swin and ResNet models also benefit across all configurations, with increases typically between $+3$ and $+10$ points.
DeiT-S shows small drops on some same-background center cases (CeO/CoO), but still improves notably on random-background conditions (XxR), while DeiT-B/L gain across nearly all settings.
Three trends are apparent.
First, all baselines perform substantially worse when moving from original to random backgrounds and from centered to corner placements, indicating strong background and center biases.
Second, \schemename reduces this sensitivity: the absolute gap between center and corner, and between original and random backgrounds, shrinks for almost all models and sizes.
Third, the relative improvements are especially pronounced for smaller objects and off-center placements, suggesting that \schemename makes models more foreground-focused and less reliant on canonical object scale and position.
\section{\schemename Segmentation Samples}
\begin{figure}[t!]
\centering
\begin{subfigure}{.49\textwidth}
\includegraphics[width=\textwidth]{img/masked_image_examples_train.pdf}
\end{subfigure}
\hfill
\begin{subfigure}{.49\textwidth}
\includegraphics[width=\textwidth]{img/masked_image_examples.pdf}
\end{subfigure}
\caption{ImageNet training samples (left) and validation samples (right) of our segmentation masks with annotated bounding boxes.}
\label{fig:mask-examples}
\end{figure}
We show examples of the automatically generated segmentation masks for a diverse subset of object categories (``ant,'' ``busby,'' ``bell cote,'' ``pickelhaube,'' ``snorkel,'' ``stove,'' ``tennis ball,'' and ``volleyball'').
Note that ``busby,'' ``bell cote,'' ``pickelhaube,'' and ``snorkel'' are the four classes with the \textbf{worst} mean box precision and box-to-box IoU on the validation set.
\Cref{fig:mask-examples} (right) illustrates masks from the evaluation split, while \Cref{fig:mask-examples} (left) shows examples from the training split.
Across both sets, the masks accurately isolate foreground objects with clean boundaries, despite large variations in object scale, shape, and appearance, supporting their use for background removal and resampling in our training pipeline.
We find that the main failure cases are:
(\textit{i}) When the ground-truth annotation corresponds to only a part of an object, the predicted mask often expands to cover the entire object rather than the annotated region.
See for example ``busby'' or ``bell cote''.
(\textit{ii}) In images containing multiple instances, some objects may be missed, resulting in incomplete foreground coverage.
This is especially visible for ``busby'' and ``pickelhaube''.
However, note that especially for ``pickelhaube'' the training distribution is noticeably different from the validation distribution, showing many images with just the head instead of groups of people wearing it.
(\textit{iii}) In rare cases, the predicted mask degenerates and covers nearly the entire image, effectively eliminating the background.
This happens in $<10\%$ of all training images, and we do not use the resulting backgrounds for recombination (see \Cref{apdx:infill-ratio}).
\section{\schemename Sample Images}
\begin{table*}[h!]
\begin{table*}[t!]
\centering
\caption{Sample Images from using \schemename on ImageNet.}
\label{tbl:example-images}
@@ -199,7 +479,7 @@ Images show a broad range of spatial placements and scales for the same object,
\end{tabular}
}
\end{table*}
We visualize example infilled images for both LaMa \cite{Suvorov2021} and Attentive Eraser \cite{Sun2024} in \Cref{tab:infill-examples}.
We visualize example infilled images for both LaMa \cite{Suvorov2022} and Attentive Eraser \cite{Sun2025} in \Cref{tab:infill-examples}.
The side-by-side examples show that both methods generally produce visually consistent infills, with many pairs appearing extremely similar at a glance.
We qualitatively find that Attentive Eraser yields slightly sharper textures or more coherent local structure, while LaMa sometimes produces smoother or more homogenized regions.
Across the table, fine-detail areas such as foliage, bark, and ground textures reveal the most noticeable differences between the two methods.
@@ -208,6 +488,7 @@ Across the table, finedetail areas such as foliage, bark, and ground textures
\FloatBarrier
\newpage
\section{Image Infill Ratio}
\label{apdx:infill-ratio}
\begin{table*}[h!]
\centering
\caption{Example infills with a large relative foreground area size that is infilled (infill ratio).}

View File

@@ -1,11 +1,21 @@
% !TeX root = ../main.tex
\section{Discussion \& Conclusion}
\section{Conclusion \& Future Work}
\label{sec:conclusion}
We introduce \schemename, a novel data augmentation scheme that facilitates improved Transformer training for image classification.
By explicitly separating and recombining foreground objects and backgrounds, \schemename enables controlled data augmentation beyond existing image compositions, leading to significant performance gains on ImageNet and downstream fine-grained classification tasks.
Furthermore, \schemename provides a powerful framework for analyzing model behavior and quantifying biases, including background robustness, foreground focus, center bias, and size bias.
Our experiments demonstrate that training using \schemename not only boosts accuracy but also significantly reduces these biases, resulting in more robust and generalizable models.
In the future, we envision \schemename also being applied to other datasets and tasks, such as video recognition or segmentation.
\schemename's ability to both improve performance and provide insights into model behavior makes it a valuable tool for advancing CV research and developing more reliable AI systems.
% We introduce \schemename, a novel data augmentation scheme that facilitates improved Transformer training for image classification.
% By explicitly separating and recombining foreground objects and backgrounds, \schemename enables controlled data augmentation beyond existing image compositions, leading to significant performance gains on ImageNet and downstream fine-grained classification tasks.
% Furthermore, \schemename provides a powerful framework for analyzing model behavior and quantifying biases, including background robustness, foreground focus, center bias, and size bias.
% Our experiments demonstrate that training using \schemename not only boosts accuracy but also significantly reduces these biases, resulting in more robust and generalizable models.
% In the future, we see \schemename be also applied to other datasets and tasks, like video recognition or segmentation.
% \schemename's ability to both improve performance and provide insights into model behavior makes it a valuable tool for advancing CV research and developing more reliable AI systems.
We introduced \schemename, a controlled composition augmentation scheme that factorizes images into foreground objects and backgrounds and recombines them with explicit control over background identity, object position, and object scale.
% Empirically, \schemename consistently improves clean accuracy and robustness across architectures and scales.
Across diverse architectures, training with \schemename on top of standard strong augmentations yields substantial gains on ImageNet (up to $+6$ p.p.) and fine-grained downstream tasks (up to $+7.3$ p.p.), and consistently improves robustness on well-recognized benchmarks (up to $+19$ p.p.).
\schemename's compositional controls additionally provide a framework for analyzing model behavior and quantifying biases, including background robustness, foreground focus, center bias, and size bias.
This dual role of \schemename as both a training mechanism and an evaluation tool highlights the value of explicit compositional factorization in understanding and improving image classifiers.
In future work, we aim to extend controlled composition beyond classification to multi-object and dense prediction settings, including detection, segmentation, and video recognition.
% By coupling performance gains with interpretable, controllable evaluations, \schemename offers a practical data-centric tool for advancing robust and reliable computer vision systems.
More generally, we believe that designing augmentations around explicitly controllable and interpretable generative setups is a promising direction for building robust and reliable vision systems.

View File

@@ -1,231 +1,50 @@
% !TeX root = ../main.tex
\section{Experiments}
\label{sec:experiments}
% \begin{itemize}
% \item [1.] Training on RecombiNet
% \item ImageNet results (large)
% \item Ablation (TinyImageNet): Foreground position
% \item Ablation (TinyImageNet): Which background (or part of other ablation table?)
% \item Ablation (TinyImageNet+ImageNet For edge blur): Design decisions: Which infill model, pruning threshold, p$\to$t /t$\to$p, foreground rotation range (?), edge blur, original image probability/schedule, Foreground size
% \item With other Data Augmentations
% \item [2.] More evalution metrics
% \item Background accuracy (how to frame/sell? Background bias?) / Background robustness (= foreground with all background)?
% \item Foreground focus
% \item Position bias
% \item Size bias
% \end{itemize}
We conduct a comprehensive suite of experiments to validate the effectiveness of our approach,
% We compare training on \name, the ImageNet instantiation of \schemename, to training on ImageNet for 10 different models.
comparing ImageNet-training with and without \schemename for 10 different models.
Furthermore, we assess the impact of using \schemename for pretraining on multiple fine-grained downstream datasets.
Finally, we exploit \schemename's control over the image distribution to quantify model behaviors and biases.
We always report the mean and standard deviation of three independent training runs.
\subsection{Design Choices of ForAug}
\label{sec:ablation}
We start by ablating the design choices of \schemename on TinyImageNet~\cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each. %, and Tiny\name, the application of \schemename to TinyImageNet.
% \Cref{tab:ablation} presents the results of these ablations.
\Cref{tab:ablation-segment} presents ablations for segmentation and \Cref{tab:ablation-recombine} for recombination.
\begin{table}
\caption{Ablation of the design decisions in the segmentation phase of \schemename on TinyImageNet.
The first line is our baseline, while the other lines are using \schemename.
We use basic settings with the \emph{same} background strategy during recombination for this experiment.
\begin{figure}[t]
\begin{minipage}[t]{.62\textwidth}
\captionof{table}{ImageNet results when training ViTs with different data augmentation pipelines.
\schemename consistently improves performance in low- and mid-augmentation regimes and remains complementary to strong augmentation pipelines, with larger gains for larger models.
}
\label{tab:ablation-segment}
\label{tab:imagenet-pipelines}
\centering
\small
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{cccc}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccc}
\toprule
\multirow{2.5}{*}{\makecell{Detect. \\Prompt}} & \multirow{2.5}{*}{\makecell{Infill \\ Model}} & \multicolumn{2}{c}{TinyImageNet Accuracy [\%]} \\
\cmidrule{3-4}
& & ViT-Ti & ViT-S \\
\multirow{2.5}{*}{Augmentation} & \multirow{2.5}{*}{MixUp} & \multirow{2.5}{*}{CutMix} & \multicolumn{3}{c}{Accuracy [\%] using} \\
\cmidrule(l){4-6}
& & & ViT-S & ViT-B & ViT-L \\
\midrule
\multicolumn{2}{l}{\textbf{TinyImageNet}} & $66.1 \pm 0.5$ & $68.3 \pm 0.7$ \\
specific & LaMa \cite{Suvorov2021} & $65.5 \pm 0.4$ & $71.2 \pm 0.5$ \\
general & \gtxt{LaMa \cite{Suvorov2021}} & $66.4 \pm 0.6$ & $72.9 \pm 0.6$ \\
\gtxt{general} & Att. Eraser \cite{Sun2024} & $67.5 \pm 1.2$ & $72.4 \pm 0.5$ \\
Basic & \xmark & \xmark & $71.9 \pm 0.1$ & $69.5 \pm 0.2$ & $68.3 \pm 0.4$ \\
Basic + \schemename & \xmark & \xmark & $75.7 \pm 0.2$ & $75.5 \pm 0.6$ & $73.1 \pm 1.7$ \\
& & & \grntxt{$+3.8$} & \grntxt{$+6.0$} & \grntxt{$+4.8$} \\
\midrule
RandAugment & \xmark & \xmark & $76.3 \pm 0.5$ & $75.5 \pm 0.2$ & $74.7 \pm 0.4$ \\
RandAugment + \schemename & \xmark & \xmark & $78.0 \pm 0.1$ & $77.8 \pm 0.1$ & $78.0 \pm 0.6$ \\
& & & \grntxt{$+1.7$} & \grntxt{$+2.3$} & \grntxt{$+3.3$} \\
\midrule
Basic & \cmark & \cmark & $79.8 \pm 0.3$ & $78.6 \pm 0.4$ & $78.1 \pm 1.6$ \\
Basic + \schemename & \cmark & \cmark & $79.8 \pm 0.3$ & $81.6 \pm 0.5$ & $81.0 \pm 0.4$ \\
& & & \gtxt{$\pm 0.0$} & \grntxt{$+3.0$} & \grntxt{$+2.9$} \\
\midrule
3-Augment & \xmark & \cmark & $79.1 \pm 0.1$ & $77.6 \pm 0.2$ & $75.3 \pm 0.4$ \\
3-Augment + \schemename & \xmark & \cmark & $81.4 \pm 0.1$ & $81.1 \pm 0.4$ & $79.8 \pm 0.1$ \\
& & & \grntxt{$+2.3$} & \grntxt{$+3.5$} & \grntxt{$+4.5$} \\
\midrule
RandAugment & \cmark & \cmark & $80.1 \pm 0.1$ & $81.9 \pm 0.3$ & $79.3 \pm 2.3$ \\
RandAugment + \schemename & \cmark & \cmark & $80.0 \pm 0.3$ & $81.9 \pm 0.2$ & $82.4 \pm 0.1$ \\
& & & \gtxt{$-0.1$} & \gtxt{$\pm 0.0$} & \grntxt{$+3.1$} \\
\bottomrule
\end{tabular}}
\end{table}
\begin{table}[t]
\caption{Ablation of the recombination phase of \schemename on TinyImageNet (top) and ImageNet (bottom). The first experiments use the initial segmentation settings with LaMa \cite{Suvorov2021}.}
\label{tab:ablation-recombine}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{ccccccccccc}
\toprule
% FG. & Augment. & BG. & BG. & Edge & Original & \multicolumn{2}{c}{Accuracy [\%]} \\
% Size & Order & Strat. & Prune & Smoothing & Mixing & ViT-Ti & ViT-S \\
\multirow{2.5}{*}{\makecell{FG. \\size}} & \multirow{2.5}{*}{\makecell{Augment.\\Order}} & \multirow{2.5}{*}{\makecell{BG\\Strat.}} & \multirow{2.5}{*}{\makecell{BG.\\Prune}} & \multirow{2.5}{*}{\makecell{Original\\Mixing}} & \multirow{2.5}{*}{\makecell{Edge\\Smooth.}} & \multicolumn{2}{c}{Accuracy [\%]} \\
\cmidrule{7-8}
& & & & & & ViT-Ti & ViT-S \\
\midrule
% TinyImageNet & & & & & & & $66.1\pm0.5$ & $68.3\pm0.7$ \\
\multicolumn{6}{l}{\textbf{TinyImageNet}} & \gtxt{$66.1\pm0.5$} & \gtxt{$68.3\pm0.7$} \\
mean & crop$\to$paste & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\
range & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\
\midrule
% \gtxt{range} & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $66.4\pm0.6$ & $72.9\pm0.6$ \\
{range} & {crop$\to$paste} & {same} & {-} & {-} & {-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\
\gtxt{range} & paste$\to$crop & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 2.0$ & \gtxt{-} & $67.2\pm0.4$ & $72.9\pm0.5$ \\
% \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{-} & $65.9\pm0.5$ & $72.4\pm0.6$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.2$ & \gtxt{-} & $69.8\pm0.5$ & $75.0\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.33$ & \gtxt{-} & $69.5\pm0.4$ & $75.2\pm1.0$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.5$ & \gtxt{-} & $70.3\pm1.0$ & $74.2\pm0.2$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & linear & \gtxt{-} & $70.1\pm0.7$ & $74.9\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & reverse lin. & \gtxt{-} & $67.6\pm0.2$ & $73.2\pm0.3$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & cos & \gtxt{-} & $71.3\pm1.0$ & $75.7\pm0.8$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & $70.0\pm0.8$ & $75.5\pm0.7$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & orig. & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $67.2\pm0.9$ & $69.9\pm1.0$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $70.1\pm0.7$ & $77.5\pm0.6$ \\
\midrule
\multicolumn{6}{l}{\textbf{ImageNet}} & \gtxt{-} & \gtxt{$79.1\pm0.1$} \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & \gtxt{-} & - & $80.5\pm0.1$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & - & $80.7\pm0.1$ \\
\gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & - & $81.4\pm0.1$ \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Prompt.}
% We present the ablation of our main design decisions in \Cref{tab:ablation}.
First, we evaluate the type of prompt used to detect the foreground object.
Here, the \emph{general} prompt, which contains the class and the more general object category, outperforms only having the class name (\emph{specific}).
\textbf{Inpainting.} Among inpainting models, Attentive Eraser~\cite{Sun2024} produces slightly better results compared to LaMa~\cite{Suvorov2021} ($+0.5$ p.p. on average).
For inpainting examples, see the supplementary material.
% (see the supplementary material for examples).
% When comparing the infill models, the GAN-based LaMa \cite{Suvorov2021} gets outperformed by the Attentive Eraser \cite{Sun2024}.
\textbf{Foreground size}
% We observe that LaMa's often infills unnatural textures compared to Attentive Eraser.
% The size of foreground objects during training has a significant impact on the performance.
% Here, using the greater variability of the \emph{range} strategy increases the performance by $\approx 1\%$ compared to the \emph{mean} strategy.
significantly impacts performance.
Employing a \emph{range} of sizes during recombination, rather than a fixed \emph{mean} size, boosts accuracy by approximately 1 p.p.
This suggests that the added variability is beneficial.
\textbf{Order of data augmentation.}
% (1) Applying the image crop related augmentations \emph{before} pasting the foreground object and the color-based ones \emph{after} pasting or (2) applying all data augmentations after pasting the foreground object.
% While results are ambiguous, we choose the second strategy, as it improves the performance of ViT-S, although not the one of ViT-Ti.
Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}).
ViT-Ti results are ambiguous.
\textbf{Background pruning.}
When it comes to the backgrounds to use, we test different pruning thresholds ($t_\text{prune}$) to exclude backgrounds with large inpainted regions.
% and only use backgrounds with an relative size of the infilled region of at most $t_\text{prune}$ (exclusive).
A threshold of $t_\text{prune}=1.0$ means that we use all backgrounds that are not fully infilled.
% We find that the background pruning does not significantly impact the models' performance.
% We choose $t_\text{prune}=0.8$ for the following experiments to exclude backgrounds that are mostly artificial.
Varying $t_\text{prune}$ has minimal impact.
We choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds.
% One of the most important design decisions is the mixing of the original dataset with \name.
\textbf{Mixing} \schemename-augmented samples with the original ImageNet data proves crucial.
While constant and linear mixing schedules improve performance over no mixing by $2-3$ p.p. compared to only augmented samples, the cosine annealing schedule proves optimal, boosting accuracy by $3-4$ p.p.
\textbf{Edge smoothing.}
We evaluate the impact of using Gaussian blurring to smooth the edges of the foreground masks.
% Similarly, applying edge smoothing to foreground masks with Gaussian blurring actually hurts performance on Tiny\name, but slightly improves it on \name.
For larger models, this gives us a slight performance boost on the full ImageNet (second to last line in \Cref{tab:ablation-recombine}).
\textbf{Background strategy.}
Another point is the allowed choice of background image for each foreground object.
% We evaluate three different strategies.
% (1) Picking the background from which that specific foreground was originally extracted.
% The major difference to ImageNet when using this setup is the variability in size and position of the foreground object.
% (2) Picking a background that originally had a foreground object of the same class in it.
% Here, we have backgrounds where objects of this type can typically appear while also creating a wider variety of samples due to pairing each foreground object with different backgrounds each time.
% (3) Picking any background.
% This choice has the largest variety of backgrounds, but the backgrounds are not semantically related to the foreground object anymore.
% We find in \Cref{fig:bg-strategy} that choosing only a foreground's original background is the worst choice.
We compare using the original background, a background from the same class, and any background.
These strategies go from low diversity and high shared information content between the foreground and background to high diversity and low shared information content.
For \emph{ViT-Ti}, the latter two strategies perform comparably, while \emph{ViT-S} benefits from the added diversity of using any background.
The same is true when training on the full ImageNet.
\begin{table}
\caption{Accuracy of ViT-S on TinyImageNet (TIN) in percent using \schemename with different foreground position distributions by varying the Bates parameter $\eta$.
The best performance is achieved when using the uniform distribution ($\eta=1$) for training.}
\label{tbl:foreground-eta}
\centering
\small
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{ccccccc}
\toprule
\multirow{2.5}{*}{\makecell{Bates Parameter \\during training}} & \multirow{2.5}{*}{\makecell{TIN \\w/o \schemename}} & \multicolumn{5}{c}{TIN w/ \schemename} \\
\cmidrule(l){3-7}
& & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\
\midrule
Baseline & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 \\
$\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\
$\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\
$\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\
$\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\
$\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Foreground position.}
Finally, we analyze the foreground object's positioning in the image, using a
generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \Z$.
The Bates distribution presents an easy way to sample from a bounded domain with just one hyperparameter that controls its concentration.
$\eta = 1/-1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders (see supplementary material for details).
% We utilize an extended Bates distribution to sample the position of the foreground object.
% The Bates distribution with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}.
% The larger $\eta$, the more concentrated the distribution is at the center, $\eta < -1$ concentrates the distribution at the edges.
% We extend this concept to $\eta \leq -1$, shifting the distribution away from the center and towards the edges.
When sampling more towards the center of the image, the difficulty of the task is reduced, which reduces performance on TinyImageNet (\Cref{tbl:foreground-eta}).
This is reflected in the performance when evaluating using \schemename with $\eta=2$ and $\eta=3$ compared to $\eta=-1/1$.
We observe a similar reduction for $\eta < -1$.
% This experiment is conducted using the LaMa infill model.
\begin{table}
\caption{Dataset statistics for TinyImageNet and ImageNet with and without \schemename. For \schemename we report the number of foreground/background pairs.}
\label{tab:dataset-stats}
\centering
\resizebox{.9\columnwidth}{!}{
\begin{tabular}{l S[table-format=4.0] S[table-format=7.0] S[table-format=5.0]}
\toprule
Dataset & {Classes} & {\makecell{Training \\ Images}} & {\makecell{Validation \\ Images}} \\
\midrule
TinyImageNet & 200 & 100000 & 10000 \\
TinyImageNet + \schemename & 200 & 99404 & 9915 \\
ImageNet & 1000 & 1281167 & 50000 \\
ImageNet + \schemename & 1000 & 1274557 & 49751 \\
\bottomrule
\end{tabular}}
\end{table}
After fixing the optimal design parameters in \Cref{tab:ablation-segment,tab:ablation-recombine} (last rows), we run \schemename's segmentation step on the entire ImageNet dataset.
\Cref{tab:dataset-stats} shows the resulting dataset statistics.
% The slightly lower number of images in \name is due to \emph{Grounded SAM} returning no or invalid detections for some images.
The slightly reduced image count for \schemename is due to instances where Grounded SAM fails to produce valid segmentation masks.
% (removed stale pre-merge duplicate of the subsection heading and table caption;
% the current versions appear in the minipage below and in \Cref{sec:experiments})
\end{tabular}
}
\end{minipage}
\hfill
\begin{minipage}[t]{.37\textwidth}
\captionof{table}{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
\label{tab:imagenet-results}
\centering
\small
\resizebox{\textwidth}{!}{\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& w/o \schemename & w/ \schemename & \\
\midrule
% (removed stray diff hunk marker)
ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
\bottomrule
\end{tabular}}
% (removed stale \end{table}; the surrounding minipage/figure structure closes below)
\end{minipage}
\end{figure}
\Cref{tab:imagenet-results} compares the ImageNet performance of models trained with and without \schemename.
We adopt the training setup of \cite{Nauen2025} and \cite{Touvron2022} for training ViT \cite{Dosovitskiy2021}, Swin \cite{Liu2021} and ResNet \cite{He2016} (representing CNNs) models as well as the setup of DeiT \cite{Touvron2021b} for that model.
Both setups use strong data augmentations like RandAugment, CutMix, and Mixup optimized for Transformers (details in supplementary material).
% \begin{table}[t]
% \caption{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.}
% \label{tab:imagenet-results}
% \centering
% \begin{subfigure}{.41\textwidth}
% \resizebox{\textwidth}{!}{\begin{tabular}{lccc}
% \toprule
% \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
% \cmidrule(lr){2-3}
% & w/o \schemename & w/ \schemename & \\
% \midrule
% ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\
% ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\
% ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\
% \midrule
% Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\
% Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\
% \bottomrule
% \end{tabular}}
% \end{subfigure}
% \hspace{5pt}
% \begin{subfigure}{.448\textwidth}
% \resizebox{\textwidth}{!}{\begin{tabular}{lccc}
% \toprule
% \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\
% \cmidrule(lr){2-3}
% & w/o \schemename & w/ \schemename & \\
% \midrule
% DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\
% DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\
% DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\
% \midrule
% ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\
% ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\
% \bottomrule
% \end{tabular}}
% \end{subfigure}
% \end{table}
\section{Experiments}
\label{sec:experiments}
We conduct a comprehensive suite of experiments to validate the effectiveness of our approach,
comparing ImageNet training with and without \schemename for 10 different models and 5 data augmentation pipelines.
Furthermore, we assess the impact of using \schemename for pretraining on multiple fine-grained downstream datasets.
Finally, we exploit \schemename's control over the image distribution to quantify model behaviors and biases.
We always report the mean and standard deviation of three independent training runs.
\subsection{Image Classification Results}
\textbf{ImageNet training.}
\Cref{tab:imagenet-pipelines} analyzes the effect of \schemename under different data augmentation pipelines:
A \emph{basic} pipeline with RandomResizedCrop, Flip and ColorJitter, the \emph{3-Augment} pipeline from \cite{Touvron2022,Nauen2025} that also includes Grayscale, Solarization and GaussianBlur, as well as the widely used \emph{RandAugment}~\cite{Cubuk2020} based pipeline from DeiT~\cite{Touvron2021b}.
Additionally, we include MixUp~\cite{Zhang2018a} and CutMix~\cite{Yun2019} augmentations.
% We also include Mixup and CutMix.
We find that the effectiveness of \schemename depends on the interplay between model capacity and baseline augmentation strength.
When the baseline augmentation is weak or moderate, \schemename consistently improves ImageNet accuracy, with gains increasing for larger ViT models (up to $+6.0$ p.p.\ for ViT-B).
As the augmentation pipeline becomes stronger (e.g., RandAugment with MixUp and CutMix), ImageNet improvements diminish for smaller models, indicating that the baseline augmentation already saturates their capacity.
Importantly, even in cases where ImageNet accuracy does not improve, we consistently observe gains during downstream fine-tuning (see \Cref{tab:downstream-results}), suggesting that \schemename enhances representation quality beyond what is reflected by ImageNet accuracy.
\Cref{tab:imagenet-results} additionally compares performance of different model architectures.
ViT~\cite{Dosovitskiy2021}, Swin~\cite{Liu2021} and ResNet~\cite{He2016} (representing CNNs) are trained using the ``3-augment'' strategy, while DeiT~\cite{Touvron2021b} is trained using the ``RandAugment'' strategy.
Notably, \schemename improves performance across all tested architectures, including the ResNet models, % (up to $1$ p.p.),
demonstrating benefits beyond Transformers.
For DeiT we only observe benefits on ImageNet for the larger models.
For other transformers, we observe improvements from $1.2$ p.p. to $4.5$ p.p. with increasing gains for larger models.
% This improvement is more substantial for the larger models, with ViT-L gaining $4.5$ p.p. in accuracy.
\schemename's improvements counteract the drop in performance for increasing model sizes.
Without \schemename this drop is $3.8$ p.p. (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p.
For DeiT there is a drop of $0.8$ p.p. from small to large while when using \schemename there is a \emph{gain} of $2.4$ p.p.
\begin{table}
\caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-augment data augmentation on top of the copy-paste augmentation.}
\label{tab:copy-paste-comparison}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]}
\toprule
Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\
\midrule
% Baseline & & $79.1 \pm 0.1$ \\
Baseline + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\
+ mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\
+ fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\
+ \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\
+ infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\
+ \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\
+ edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\
+ background pruning$=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Comparison to Simple Copy-Paste.}
We compare \schemename to a simple adaption of the Copy-Paste augmentation inspired by \cite{Ge2023,Ghiasi2020,Shermaine2025} in \Cref{tab:copy-paste-comparison}.
Contrary to semantic segmentation, we do not have foreground masks available.
Thus, we paste the extracted foreground objects from \emph{\schemename's segmentation stage} onto normal ImageNet images.
% Since such images do not have straight forward classification labels, we test multiple possibilities.
We observe 3 large jumps in accuracy: (\textbf{1}) From our \emph{range} foreground size variation ($+11.4$ p.p.), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset ($+25.7$ p.p.), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images ($+12.5$ p.p.).
\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance.
% We find that \schemename's improvements counteract the drop in performance for increasing model sizes.
% Without \schemename this drop is $3.8$ p.p. (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p.
% For DeiT there is a drop of $0.8$ p.p. from small to large while when using \schemename there is a \emph{gain} of $2.4$ p.p.
\begin{table}[t]
\caption{Downstream accuracy in percent when finetuning on other datasets. Models are pretrained on ImageNet with and without \schemename. Pretraining using \schemename increases transformer downstream accuracy.
% on all datasets.
}
\label{tab:downstream-results}
\centering
\begin{subfigure}{.48\columnwidth}
\resizebox{\textwidth}{!}{\begin{tabular}{lcccccc}
\toprule
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
\midrule
ViT-S & \xmark & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\
ViT-S & \cmark & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\
& & \grntxt{$+6.2$} & \grntxt{$+2.4$} & \grntxt{$+1.0$} & \grntxt{$+0.5$} & \grntxt{$+0.7$} \\
\midrule
ViT-B & \xmark & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\
ViT-B & \cmark & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\
& & \grntxt{$+7.3$} & \grntxt{$+3.3$} & \grntxt{$+1.7$} & \grntxt{$+1.1$} & \grntxt{$+1.0$} \\
\midrule
ViT-L & \xmark & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\
ViT-L & \cmark & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\
& & \grntxt{$+5.5$} & \grntxt{$+0.3$} & \grntxt{$+2.2$} & \grntxt{$+1.2$} & \grntxt{$+0.9$} \\
\midrule
Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\
Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\
& & \grntxt{$+4.1$} & \grntxt{$+2.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\
\midrule
Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\
Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\
& & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+1.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\
\bottomrule
\end{tabular}}
\end{subfigure}
\hfill
\begin{subfigure}{.505\columnwidth}
\resizebox{\textwidth}{!}{\begin{tabular}{lcccccc}
\toprule
Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\
\midrule
DeiT-S & \xmark & $75.3\pm0.4$ & $91.1\pm0.2$ & $94.8\pm0.4$ & $89.2\pm0.2$ & $92.4\pm0.2$ \\
DeiT-S & \cmark & $76.8\pm0.8$ & $91.9\pm0.2$ & $95.2\pm0.3$ & $89.1\pm0.2$ & $92.3\pm0.4$ \\
& & \grntxt{$+1.5$} & \grntxt{$+0.8$} & \grntxt{$+0.4$} & \gtxt{$-0.1$} & \gtxt{$-0.1$} \\
\midrule
DeiT-B & \xmark & $77.0\pm1.2$ & $92.9\pm0.2$ & $96.1\pm0.2$ & $91.2\pm0.1$ & $93.3\pm0.4$ \\
DeiT-B & \cmark & $79.3\pm0.3$ & $93.1\pm0.1$ & $96.4\pm0.2$ & $91.3\pm0.1$ & $93.3\pm0.1$ \\
& & \grntxt{$+2.3$} & \gtxt{$+0.2$} & \grntxt{$+0.3$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
\midrule
DeiT-L & \xmark & $72.8\pm5.5$ & $92.8\pm1.0$ & $95.8\pm1.5$ & $90.5\pm2.6$ & $92.4\pm2.0$ \\
DeiT-L & \cmark & $78.8\pm0.8$ & $93.8\pm0.2$ & $97.0\pm0.2$ & $92.0\pm0.2$ & $93.5\pm0.2$ \\
& & \grntxt{$+6.0$} & \grntxt{$+1.0$} & \grntxt{$+1.2$} & \grntxt{$+1.5$} & \grntxt{$+1.1$} \\
\midrule
Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\
Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\
& & \grntxt{$+4.1$} & \grntxt{$+2.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\
\cmidrule(r){1-1}
Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\
Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\
& & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+1.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\
\midrule
ResNet-50 & \xmark & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\
ResNet-50 & \cmark & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\
& & \grntxt{$+2.1$} & \grntxt{$+0.6$} & \gtxt{$\pm0.0$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\
\midrule
ResNet-101 & \xmark & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\
ResNet-101 & \cmark & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\
& & \grntxt{$+3.0$} & \grntxt{$+1.3$} & \grntxt{$+1.7$} & \grntxt{$+0.3$} & \textcolor{red}{$-0.3$} \\
\bottomrule
\end{tabular}}
\end{subfigure}
\end{table}
\textbf{Downstream tasks.} To assess the transferability of \schemename-trained models, we finetune models pretrained on ImageNet with and without \schemename on five fine-grained datasets:
In \Cref{tab:downstream-results} we see transformer accuracies improve on all the datasets.
% and a reduction of error rate of up to $39.3\%$.
% Notably, training with \name increases the downstream performance of DeiT-S and DeiT-B, even though the ImageNet results were the same.
% This demonstrates that the improved representations from training on \name translate to superior performance beyond gains from better ImageNet performance.
Notably, training with \schemename boosts the downstream performance of DeiT-S and DeiT-B, despite similar ImageNet accuracy.
This shows that the improved representations from training with \schemename translate to gains beyond better ImageNet scores.
% not only on ImageNet, but also on fine-grained image classification tasks.
\begin{table}[t]
\caption{Evaluation of models trained on ImageNet with and without \schemename. \schemename generally increases models' robustness to different image distribution shifts. Note that ViT-S \emph{with} \schemename outperforms DeiT-S, the only model where \schemename does not increase robustness.}
\label{tab:robustness-datasets}
\begin{subfigure}{.485\textwidth}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccccc}
\toprule
Model & w/ \schemename & IN-Hard & IN-A & IN-C & IN-R & IN-V2 \\
\midrule
ViT-S & \xmark & $18.1 \pm 0.6$ & $18.8 \pm 0.2$ & $44.7 \pm 0.8$ & $41.6 \pm 0.6$ & $67.3 \pm 0.4$ \\
ViT-S & \cmark & $21.0 \pm 0.4$ & $26.5 \pm 0.4$ & $52.6 \pm 0.6$ & $49.8 \pm 0.3$ & $70.6 \pm 0.1$ \\
& & \grntxt{$+2.9$} & \grntxt{$+7.7$} & \grntxt{$+7.9$} & \grntxt{$+8.1$} & \grntxt{$+3.3$} \\
\midrule
ViT-B & \xmark & $17.0 \pm 0.4$ & $15.8 \pm 0.7$ & $40.4 \pm 0.8$ & $38.4 \pm 0.7$ & $65.1 \pm 0.6$ \\
ViT-B & \cmark & $22.0 \pm 0.9$ & $31.9 \pm 1.5$ & $51.6 \pm 1.8$ & $48.7 \pm 1.7$ & $70.3 \pm 0.9$ \\
& & \grntxt{$+5.0$} & \grntxt{$+16.0$} & \grntxt{$+11.2$} & \grntxt{$+10.3$} & \grntxt{$+5.2$} \\
\midrule
ViT-L & \xmark & $15.6 \pm 0.4$ & $11.3 \pm 0.9$ & $38.4 \pm 1.0$ & $36.8 \pm 0.8$ & $61.6 \pm 0.8$ \\
ViT-L & \cmark & $20.6 \pm 0.1$ & $30.4 \pm 0.5$ & $48.2 \pm 0.7$ & $46.0 \pm 0.4$ & $68.7 \pm 0.3$ \\
& & \grntxt{$+5.0$} & \grntxt{$+19.0$} & \grntxt{$+9.8$} & \grntxt{$+9.3$} & \grntxt{$+7.1$} \\
\midrule
Swin-Ti & \xmark & $16.2 \pm 0.4$ & $15.0 \pm 0.3$ & $36.0 \pm 0.8$ & $36.6 \pm 0.2$ & $65.5 \pm 0.4$ \\
Swin-Ti & \cmark & $18.3 \pm 0.3$ & $20.3 \pm 0.4$ & $41.4 \pm 0.8$ & $41.4 \pm 0.2$ & $68.2 \pm 0.4$ \\
& & \grntxt{$+2.2$} & \grntxt{$+5.4$} & \grntxt{$+5.4$} & \grntxt{$+4.8$} & \grntxt{$+2.7$} \\
\midrule
Swin-S & \xmark & $18.2 \pm 0.3$ & $19.4 \pm 0.3$ & $39.0 \pm 0.7$ & $39.1 \pm 0.2$ & $67.5 \pm 0.1$ \\
Swin-S & \cmark & $20.5 \pm 0.1$ & $27.7 \pm 0.4$ & $45.6 \pm 0.8$ & $44.1 \pm 0.3$ & $69.6 \pm 0.1$ \\
& & \grntxt{$+2.2$} & \grntxt{$+8.4$} & \grntxt{$+6.6$} & \grntxt{$+5.0$} & \grntxt{$+2.2$} \\
\bottomrule
\end{tabular}
}
\end{subfigure}
\hfill
\begin{subfigure}{.505\textwidth}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccccc}
\toprule
Model & w/ \schemename & IN-Hard & IN-A & IN-C & IN-R & IN-V2 \\
\midrule
DeiT-S & \xmark & $19.5 \pm 0.2$ & $18.4 \pm 0.3$ & $58.8 \pm 0.7$ & $43.0 \pm 0.1$ & $68.8 \pm 0.2$ \\
DeiT-S & \cmark & $18.5 \pm 0.5$ & $17.3 \pm 1.0$ & $57.0 \pm 0.9$ & $43.8 \pm 0.2$ & $68.7 \pm 0.6$ \\
& & \rdtxt{$-1.0$} & \rdtxt{$-1.1$} & \rdtxt{$-1.8$} & \grntxt{$+0.8$} & \gtxt{$-0.1$} \\
\midrule
DeiT-B & \xmark & $22.6 \pm 0.2$ & $26.0 \pm 0.2$ & $62.1 \pm 1.0$ & $45.6 \pm 1.9$ & $70.6 \pm 0.9$ \\
DeiT-B & \cmark & $22.6 \pm 0.2$ & $25.0 \pm 0.3$ & $62.8 \pm 0.6$ & $47.7 \pm 0.8$ & $70.8 \pm 0.5$ \\
& & \gtxt{$\pm 0.0$} & \rdtxt{$-1.0$} & \grntxt{$+0.8$} & \grntxt{$+2.0$} & \gtxt{$+0.2$} \\
\midrule
DeiT-L & \xmark & $21.2 \pm 2.0$ & $20.2 \pm 3.4$ & $59.3 \pm 4.3$ & $41.3 \pm 2.7$ & $66.9 \pm 2.8$ \\
DeiT-L & \cmark & $23.4 \pm 0.3$ & $28.8 \pm 2.0$ & $63.4 \pm 0.7$ & $47.8 \pm 0.6$ & $71.6 \pm 0.5$ \\
& & \grntxt{$+2.2$} & \grntxt{$+8.7$} & \grntxt{$+4.1$} & \grntxt{$+6.5$} & \grntxt{$+4.7$} \\
\midrule
ResNet50 & \xmark & $16.1 \pm 0.2$ & $9.7 \pm 0.1$ & $38.0 \pm 1.0$ & $40.5 \pm 0.6$ & $66.8 \pm 0.4$ \\
ResNet50 & \cmark & $17.2 \pm 0.1$ & $10.8 \pm 0.4$ & $41.0 \pm 0.7$ & $43.7 \pm 0.3$ & $67.5 \pm 0.1$ \\
& & \grntxt{$+1.1$} & \grntxt{$+1.1$} & \grntxt{$+3.0$} & \grntxt{$+3.2$} & \grntxt{$+0.7$} \\
\midrule
ResNet101 & \xmark & $18.2 \pm 0.4$ & $14.3 \pm 0.1$ & $41.7 \pm 0.7$ & $42.3 \pm 0.1$ & $67.7 \pm 0.5$ \\
ResNet101 & \cmark & $19.9 \pm 0.2$ & $17.6 \pm 0.5$ & $46.3 \pm 0.6$ & $46.3 \pm 0.3$ & $69.5 \pm 0.3$ \\
& & \grntxt{$+1.7$} & \grntxt{$+3.2$} & \grntxt{$+4.6$} & \grntxt{$+4.0$} & \grntxt{$+1.8$} \\
\bottomrule
\end{tabular}
}
\end{subfigure}
\end{table}
\subsection{Bias and Robustness Evaluation}
% Additional to just using \name for training, its special properties and posibilities for adjustment of the data distribution make it a valuable tool for evaluating other model properties and biases.
Beyond its use for training, \schemename's unique properties and controlled data generation capabilities make it a powerful tool for analyzing behavior and biases of black-box models.
We exploit this in two complementary ways.
First, we ask whether \schemename-trained models are more robust on \emph{external} ImageNet robustness benchmarks that are not generated by our pipeline.
Second, we use \schemename's fine-grained control for targeted evaluation of specific dimensions of model bias, such as background reliance and center/size bias.
% Together, these experiments allow us to both \emph{probe} and \emph{improve} robustness along clearly defined axes.
% This combination of standard benchmarks and controlled probes allows us to both quantify robustness improvements and attribute them to changes in particular model behaviors.
% (removed stale \begin{figure*}; the figure now starts below the robustness paragraph)
\textbf{Robustness on External Distribution Shifts.}
\Cref{tab:robustness-datasets} summarizes accuracy on five widely used ImageNet robustness benchmarks: ImageNet-Hard~\cite{Taesiri2023}, ImageNet-A~\cite{Hendrycks2021}, ImageNet-C~\cite{Hendrycks2019}, ImageNet-R~\cite{Hendrycks2021a}, and ImageNetV2~\cite{Recht2019}.
Across ViTs, Swin Transformers, and ResNets, incorporating \schemename during training generally improves robustness to all considered distribution shifts.
For ViTs, the gains are substantial: for example, ViT-B improves from $15.8\%$ to $31.9\%$ accuracy on ImageNet-A ($+16.0$ p.p.) and from $40.4\%$ to $51.6\%$ on ImageNet-C ($+11.2$ p.p.), with similar improvements for ViT-S and ViT-L.
Swin also benefits consistently, with increases of roughly $2$--$8$ p.p. on most benchmarks, and ResNet sees smaller but steady gains (e.g., up to $+4.6$ points on ImageNet-C).
For DeiT, the picture is more nuanced: DeiT-B and DeiT-L still enjoy robustness improvements, whereas DeiT-S exhibits small decreases on several benchmarks.
Interestingly, however, ViT-S trained with \schemename outperforms the DeiT-S baseline.
This suggests that controlled composition can partially close the robustness gap between lightly and heavily regularized models.
Overall, the consistent improvements on corruption-based, natural and hard examples indicate that the compositional invariances induced by \schemename extend beyond the specific foreground/background manipulations used in its construction.
\begin{figure*}[t]
\centering
\includegraphics[width=\textwidth]{img/bg_robustness.pdf}
\caption{Evaluation of background robustness on ImageNet + \schemename, ImageNet9~\cite{Xiao2020} and CounterAnimal~\cite{Wang2024f}.
We plot the in-distribution (top of arrow) and the out-of-distribution (bottom of arrow) accuracy when training with and without \schemename.
We annotate each arrow with its length $\Delta$.
Training with \schemename improves the background robustness of all transformers by mostly boosting the out-of-distribution accuracy.
We assess the robustness of models to shifts in the background distribution from the training distribution.
% \text{Background Robustness} = \frac{\text{Acc}(\name_\text{all})}{\text{Acc}(\name_\text{same})}
% \end{align}
% It represents the relative drop in performance under a background distribution shift.
\Cref{fig:background-robustness} presents the background robustness results for three datasets: ImageNet with \schemename (all backgrounds vs. backgrounds of same class), ImageNet9~\cite{Xiao2020} (random backgrounds vs. original backgrounds), and CounterAnimal~\cite{Wang2024f} (counter vs. common background).
The top triangle of each arrow represents the in-distribution backgrounds and the bottom triangle represents the out-of-distribution ones.
We follow ImageNet9 and CounterAnimal and assess the background robustness in terms of the accuracy gap when evaluating a model on images of normal background distribution compared to out-of-distribution backgrounds (length of each arrow; $\Delta$).
% When trained on ImageNet, smaller models generally exhibit greater robustness to changes in the background distribution than larger models and ResNet is more robust than the tested Transformer models.
Crucially, \schemename improves the background robustness of all models and across datasets, reducing the background-gap by boosting the performance on the out-of-background-distribution samples more than the in-distribution ones.
% to $\approx1.00$, meaning that these models are agnostic to the choice of background and only classify based on the foreground.
We find a similar trend for the Corner-Cases~\cite{Fatima2025} dataset (see supplementary), highlighting the generalization benefits of \schemename to unusual image compositions.
\begin{figure*}[t]
\centering
\includegraphics[width=\textwidth]{img/fg_focus.pdf}
\caption{Evaluation of the foreground focus (\Cref{eq:fg-focus}) using GradCam, GradCam++ and IntegratedGradients (IG) of models trained on ImageNet. Training with \schemename improves the foreground focus of almost all models.}
\label{fig:foreground-focus}
\end{figure*}
@@ -418,40 +351,53 @@ We hypothesize Swin's below-uniform foreground focus with GradCam is due to its
We calculate center bias according to \Cref{eq:center-bias}.
Using \schemename significantly reduces models' center bias.}
\label{tab:center-bias}
\centering
\resizebox{.78\columnwidth}{!}{
\begin{subfigure}{.48\columnwidth}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& w/o \schemename & w/ \schemename \\
\midrule
ViT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v3.pdf}} \\
ViT-S & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_ImageNet_v3.pdf} & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-S_RecombNet_all_v3.pdf} \\
& $25.5\pm0.8$ & $22.0\pm0.3$ & \grntxt{$-3.5$} \\
ViT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v3.pdf}} \\
ViT-B & {\includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_ImageNet_v3.pdf}} & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-B_RecombNet_all_v3.pdf} \\
& $25.4\pm0.4$ & $19.0\pm0.2$ & \grntxt{$-6.4$} \\
ViT-L & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v3.pdf}} \\
ViT-L & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_ImageNet_v3.pdf} & \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ViT-L_RecombNet_all_v3.pdf} \\
& $24.3\pm1.1$ & $11.7\pm0.7$ & \grntxt{$-12.6$} \\
\midrule
DeiT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v3.pdf}} \\
Swin-Ti & {\includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_ImageNet_v3.pdf}} & {\includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-Ti_RecombNet_all_v3.pdf}} \\
& $25.0\pm0.7$ & $16.5\pm0.2$ & \grntxt{$-8.5$} \\
Swin-S & {\includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_ImageNet_v3.pdf}} & {\includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/Swin-S_RecombNet_all_v3.pdf}} \\
& $23.2\pm0.1$ & $15.6\pm0.2$ & \grntxt{$-7.6$} \\
\bottomrule
\end{tabular} }
\end{subfigure}
\hfill
\begin{subfigure}{.497\columnwidth}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccc}
\toprule
\multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\
\cmidrule(lr){2-3}
& w/o \schemename & w/ \schemename \\
\midrule
DeiT-S & {\includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_ImageNet_v3.pdf} } & {\includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_fornet_all_linear_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_fornet_all_linear_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-S_fornet_all_linear_v3.pdf}} \\
& $20.4 \pm 0.2$ & $21.2 \pm 0.1$ & \gtxt{$+0.8$} \\
DeiT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v3.pdf}} \\
DeiT-B & {\includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_ImageNet_v3.pdf} } & {\includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-B_fornet_all_cos_v3.pdf}} \\
& $19.0 \pm 0.7$ & $19.0 \pm 0.2$ & \gtxt{$\pm0.0$} \\
DeiT-L & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v3.pdf} } & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v3.pdf} } \\
DeiT-L & { \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_ImageNet_v3.pdf} } & { \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/DeiT-L_fornet_all_cos_v3.pdf} } \\
& $21.2 \pm 0.2$ & $18.0 \pm 0.2$ & \grntxt{$-3.2$} \\
\midrule
Swin-Ti & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v3.pdf}} \\
& $25.0\pm0.7$ & $16.5\pm0.2$ & \grntxt{$-8.5$} \\
Swin-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v3.pdf}} \\
& $23.2\pm0.1$ & $15.6\pm0.2$ & \grntxt{$-7.6$} \\
\midrule
ResNet50 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v3.pdf}} \\
ResNet50 & {\includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_ImageNet_v3.pdf}} & {\includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet50_RecombNet_all_v3.pdf}} \\
& $26.3\pm0.3$ & $19.7\pm0.3$ & \grntxt{$-6.6$} \\
ResNet101 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v3.pdf}} \\
ResNet101 & {\includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_ImageNet_v3.pdf}} & {\includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth, valign=c]{img/ResNet101_RecombNet_all_v3.pdf}} \\
& $23.0\pm0.3$ & $19.9\pm0.2$ & \grntxt{$-3.1$} \\
\bottomrule
\end{tabular} }
\includegraphics[width=.8\columnwidth]{img/colorbar_horizontal.pdf}
\end{subfigure}
\centering
\includegraphics[width=.5\columnwidth]{img/colorbar_horizontal.pdf}
\end{table}
\textbf{Center Bias.}
@@ -474,13 +420,14 @@ Performance is generally highest in the center and lowest in the four corners.
Interestingly, ImageNet-trained models perform slightly better when the foreground object is on the right side of the image, compared to the left side, despite our use of random flipping with a probability of $0.5$ during training.
% Training on \name reduces the center bias of all models by at least half.
Using \schemename significantly reduces center bias across models, with a more uniform performance especially across the middle row.
% On corner-cases (see supplementary) we find that
% Their accuracy is higher in the center left and right cells than in the center top and bottom ones, which is not the case for ImageNet-trained models.
% This demonstrates that \schemename promotes a more uniform spatial attention distribution, counteracting the center-bias of ImageNet.
Thus, \schemename makes the model recognize objects across a wider spatial distribution, counteracting the center-bias of ImageNet.
\begin{figure}[t!]
\centering
\includegraphics[width=\columnwidth]{img/size_bias_grid.pdf}
\includegraphics[width=\columnwidth]{img/size_bias_wide.pdf}
\caption{Evaluation of the size bias of models trained on ImageNet. We plot the accuracy relative to the accuracy when using the default size ($f_\text{size} = 1.0$).}
\label{fig:size-bias}
\end{figure}
@@ -492,6 +439,87 @@ We introduce a size factor $f_\text{size}$ by which we additionally scale the fo
Results are normalized by the accuracy when using $f_\text{size} = 1.0$.
\Cref{fig:size-bias} shows the size bias curves of models trained with and without \schemename.
% When training on \name, the resulting model keeps its good performance on smaller foreground objects, while models trained on ImageNet fall off faster and lower.
Models trained using \schemename maintain better performance, especially with smaller foreground objects.
Models trained using \schemename perform better, especially with smaller foreground objects.
%, when ImageNet-trained models exhibit a more rapid performance decline.
Therefore, \schemename-training improves robustness to variations in object scale, especially for larger models.
\subsection{Design Choices of \schemename}
We next analyze key components of \schemename, focusing on three questions: how it compares to simple copy-paste, how background choice affects performance, and how reliably labels are preserved after recomposition.
Additional ablations over variants and hyperparameters are provided in the supplementary material.
\begin{table}[t]
\caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-augment data augmentation on top of the copy-paste augmentation.}
\label{tab:copy-paste-comparison}
\centering
\resizebox{.66\columnwidth}{!}{
\begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]}
\toprule
Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\
\midrule
% Baseline & & $79.1 \pm 0.1$ \\
3-Augment + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\
+ mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\
+ fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\
+ \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\
+ infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\
+ \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\
+ edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\
+ background pruning$=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\
\bottomrule
\end{tabular}}
\end{table}
\textbf{Comparison to Simple Copy-Paste.}
We compare \schemename to a simple adaptation of the Copy-Paste augmentation inspired by \cite{Ge2023,Ghiasi2021,Shermaine2025} in \Cref{tab:copy-paste-comparison}.
Contrary to semantic segmentation we do not have foreground masks available.
Thus, we paste the extracted objects from \textbf{\schemename's segmentation stage} onto normal ImageNet images.
% Since such images do not have straightforward classification labels, we test multiple possibilities.
We observe 3 large jumps in accuracy: (\textbf{1}) From our \emph{range} foreground size variation (+11.4\%), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset (+25.7\%), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images (+12.5\%).
\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance.
\begin{figure}[t]
\begin{minipage}[c]{.49\textwidth}
\centering
\includegraphics[width=\textwidth]{img/strategy.pdf}
\captionof{figure}{We compare Original, Same-class, and All-classes background selection using ViT-Ti and ViT-S backbones on TinyImageNet.
Increasing background diversity consistently improves classification accuracy.
}
\label{fig:background-strategy}
\end{minipage}
\hfill
\begin{minipage}[c]{.49\textwidth}
\centering
\includegraphics[width=\textwidth]{img/mask_expansion.pdf}
\captionof{figure}{
We vary the foreground mask area for TinyImageNet by shrinking or expanding masks relative to the original outline and report accuracy when training on $100\%$ augmented samples.
Performance is stable for expanded masks and degrades rapidly after shrinking masks.
}
\label{fig:mask-expansion}
\end{minipage}
\end{figure}
\textbf{Background Choice Strategy.}
\Cref{fig:background-strategy} shows the effect of background selection on TinyImageNet accuracy, where we trade off diversity against context plausibility.
% Using the original inpainted background yields the lowest accuracy, indicating limited regularization from contextual cues.
% Sampling backgrounds from the same class provides a modest but consistent improvement, suggesting that mild context variation encourages robustness while preserving semantic plausibility.
The best performance is achieved by sampling backgrounds from all classes, which introduces substantial context shifts, but leads to the strongest accuracy gains for both ViT-Ti and ViT-S.
Thus, aggressive background diversification is more important than context plausibility and acts as an effective form of context-based regularization rather than introducing harmful noise.
\textbf{Label Integrity.}
% We assess the label integrity of \schemename, i.e., whether object labels remain correct after recombination, by verifying that the intended object is accurately extracted.
% To this end, we leverage the object bounding box annotations provided in the ImageNet validation set.
% Specifically, we compute the \emph{box precision}, defined as the fraction of the predicted mask area that lies within the ground-truth bounding box, obtaining a mean value of $91\%$.
% In addition, we measure the \emph{box-to-box IoU}, computed as the IoU between the tight bounding box enclosing the predicted mask and the tight bounding box of the ground-truth annotation, which yields a high $76.1\%$.
% Qualitative examples of the predicted masks and bounding boxes are provided in the supplementary material.
% We additionally test label integrity under systematic mask perturbations by expanding or shrinking the foreground masks before composition.
% Concretely, starting from the original outline, we erode or dilate the mask such that the foreground area changes by some percentage.
% \Cref{fig:mask-expansion} shows that accuracy is relatively stable for expanded masks, but drops off significantly for eroded masks, consistent with cropping away semantically important object parts.
% This experiment suggests that \schemename is relatively robust to artifacts from including an object's original background in the foreground mask.
% Overall, these results indicate that the segmentation stage of \schemename reliably isolates the target class object, thereby preserving label correctness after recombination.
To quantify whether recombined images still depict the intended class, we evaluate the segmentation stage of \schemename on ImageNet validation boxes.
Our predicted masks achieve a mean box precision of $91.0\%$ (fraction of mask area inside the ground-truth bounding boxes of the ImageNet validation set) and a high box-to-box IoU of $76.1\%$, indicating that they tightly capture the target object.
Qualitative examples of the predicted masks and bounding boxes are provided in the supplementary material.
We further probe robustness to mask imprecision by eroding or dilating masks such that the foreground area changes by a fixed percentage before composition.
As shown in \Cref{fig:mask-expansion}, accuracy remains stable for expansions but drops sharply under erosion, consistent with removing semantically important object parts.
Together, these results suggest that (\textit{i}) \schemename reliably isolates the target objects and preserves label integrity and that (\textit{ii}) \schemename is robust to artifacts from an object's original background and degrades mainly when the foreground no longer contains the full object.

View File

@@ -3,71 +3,69 @@
\section{Introduction}
\label{sec:intro}
% \begin{itemize}
% \item General Into Image classification
% \item ImageNet
% \item CNNs $\to$ Transformers
% \item Traditional Data Augmentation: CNNs
% \item Problems with that: Other model properties of Transformers
% \item Our approach: Recombining ImageNet foregrounds and backgrounds
% \end{itemize}
\begin{figure}
% \begin{figure}
% \centering
% \includegraphics[width=.5\columnwidth]{img/fig-1.pdf}
% \caption{\schemename factorizes each training image into a foreground object and a background, then recombines them on the fly while controlling background identity, object position, and object scale. Standard, strong augmentations are applied afterwards.}
% \label{fig:fig-1}
% \end{figure}
\begin{table}[t]
\caption{Examples of \schemename generated images (center cropped) from ImageNet.
We successfully segment even multiple objects (\textit{Macaw}) and complex shapes (\textit{Cricket}).}
\label{tab:foraug-examples}
\centering
\includegraphics[width=\columnwidth]{img/fig-1.pdf}
\caption{Comparison of traditional image classification training and training when using \schemename. \schemename recombines foreground objects with different backgrounds each epoch, thus creating a more diverse training set. We still apply strong traditional data augmentation afterwards.}
\label{fig:fig-1}
\end{figure}
Image classification, a fundamental task in computer vision (CV), involves assigning labels to images from a set of categories.
It underpins a wide range of applications, like medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2022b}, and object recognition~\cite{Carion2020,He2017,Girshick2013} and facilitates large-scale pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b}, and progress evaluation in CV~\cite{Khan2022, Rangel2024}.
% Furthermore, image classification is used for large-scale pretraining of vision models~\cite{Dosovitskiy2021,Liu2021,Touvron2021b} and to judge the progress of the field of CV \cite{Khan2022, Rangel2024}.
The advent of large-scale datasets, particularly ImageNet~\cite{Deng2009}, served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016}; ImageNet has remained the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}.
% containing millions of labeled images across thousands of categories, has been instrumental in driving significant progress in this field.
% ImageNet served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016} and remains the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}.
% It is used to train and evaluate the best models in the field.
While traditionally, convolutional neural networks (CNNs) have been the go-to architecture in CV, Transformers \cite{Vaswani2017}, particularly the Vision Transformer (ViT) \cite{Dosovitskiy2021}, have emerged as a powerful alternative, demonstrating
% These attention-based models have demonstrated
superior performance in various vision tasks, including image classification \cite{Wortsman2022,Yu2022,Carion2020,Zong2022,Wang2022a}.
\resizebox{.9\textwidth}{!}{
\begin{tabular}{ccccc}
\toprule
Class & \makecell{Original \\Image} & \makecell{Extracted \\Foreground} & \makecell{Infilled \\Background} & Recombined Examples \\
\midrule
Macaw & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507_v1_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01818515_31507_v1_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v12.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v18.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v3.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v4.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01818515_31507_recombined_v6.JPEG} \\
% Conch & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n01943899_20070_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v9.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v10.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v11.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v12.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v17.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n01943899_20070_recombined_v8.JPEG} \\
Cricket & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n02229544_6170_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v0.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v10.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v16.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v2.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n02229544_6170_recombined_v6.JPEG} \\
Laptop & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615.JPEG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615_fg.PNG} & \includegraphics[max width=.1\columnwidth, max height=2cm, valign=c]{img/appendix_examples/n03642806_3615_bg.JPEG} & \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v0.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v1.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v11.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v14.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v15.JPEG} \includegraphics[width=.1\columnwidth, valign=c]{img/foraug_examples/n03642806_3615_recombined_v2.JPEG} \\
\bottomrule
\end{tabular}
}
\end{table}
Large-scale image classification is a central driver of modern computer vision: it benchmarks progress in computer vision~\cite{Khan2022,Rangel2024}, powers model pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b}, and yields representations that transfer broadly and underpin applications like medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2023a}, and object recognition~\cite{Carion2020,He2017,Girshick2014}.
However, classification supervision is weak in an important sense: the label does not specify \emph{how} the class-object should appear.
In ImageNet~\cite{Deng2009} for example, objects often occur at characteristic positions and scales and co-occur with correlated scene context~\cite{Fatima2025,Barbu2019}.
% In datasets such as ImageNet, objects often occur at characteristic positions and scales and co-occur with correlated scene context~\cite{Fatima2025,Barbu2019}.
As a result, models rely on shortcuts like background cues, center bias, or size bias, that boost in-distribution accuracy but hurt robustness and transfer~\cite{Geirhos2020,Fatima2025,Barbu2019}.
Data augmentation is a key technique for training image classification models.
% A key technique for training image classification models, especially with limited data, is data augmentation.
Traditional augmentation methods, such as cropping, flipping, or color shifts, are commonly employed to increase data diversity~\cite{Xu2023d, Shorten2019}, but remain bound to existing image compositions.
While these preserve the images' semantic meaning, their ability to teach spatial invariances is limited.
% the diversity of the training data and improve the model's performance~\cite{Xu2023d, Shorten2019}.
% These basic transformations, originally designed for CNNs, change the input images in a way that preserves their semantic meaning~\cite{Alomar2023}, but are limited to existing image compositions.
While combinations of these data augmentations are still used today, they were originally proposed to benefit CNNs.
However, the architectural differences of CNNs and Transformers suggest that the latter might benefit from different data augmentation strategies.
In particular, the self-attention mechanism, unlike a CNN, is not translation equivariant~\cite{RojasGomez2023,Ding2023a}, meaning that the model is not designed to understand the spatial relationships between pixels.
% This creates the need for novel data augmentation strategies tailored to the Transformer architecture.
% This fact opens a new design space for data augmentation strategies to help Transformers understand the basic invariances of image classification.
% Note that these traditional data augmentations are also limited by existing image compositions.
Here, data augmentation is the default defense.
Standard transformations (crop/flip/color jitter) and stronger policies such as MixUp~\cite{Zhang2018a}/CutMix~\cite{Yun2019} and automated augmentation search~\cite{Cubuk2019,Cubuk2020} expand appearance diversity~\cite{Shorten2019,Xu2023d}. % , yet they largely preserve the original \emph{composition} of each image~\cite{Shorten2019,Xu2023d}.
However, their ability to teach spatial and compositional invariances is limited.
This constraint matters especially for Vision Transformers (ViTs)~\cite{Dosovitskiy2021}: with weaker built-in spatial inductive biases than Convolutional Neural Networks (CNNs), ViTs must learn key equivariances (e.g., translation and scale robustness) primarily from data.
Copy-paste style augmentations~\cite{Ghiasi2021,Kang2022} alter composition more aggressively by overlaying segmented objects onto other images.
These are typically designed for detection or instance segmentation and rely on dense human annotations available for these tasks or use unconstrained dataset images as backgrounds.
As a result, they do not offer fine-grained control of object position and scale, and they do not explicitly enforce that the pasted background is semantically neutral, creating ambiguous labels for classification.
Recognizing that Transformers need to learn spatial relationships directly from data,
% and in general are usually trained on larger datasets~\cite{Kolesnikov2020},
we propose \schemename, a data augmentation method that makes these relationships explicit by recombining foreground objects with diverse backgrounds.
Thus, \schemename goes beyond existing image compositions and encodes desired invariances directly into the training data (see \Cref{fig:fig-1}).
% Inspired by this inductive bias of CNNs, that is not inherent to ViTs, we propose \schemename, a novel data augmentation scheme for image classification which makes the translation equivariance of CNNs explicit in the training data by recombining foreground objects at varying positions with different backgrounds.
% In this paper, we address the challenge of effectively training Transformers for image classification by proposing \schemename, a novel data augmentation scheme for image classification, which combines foreground objects with different backgrounds.
% Applying \schemename to ImageNet gives rise to \name, a novel dataset that enables this data augmentation with with fine-grained control over the image composition.
Applying \schemename to a dataset like ImageNet is a two-step process:
(1)~We separate the foreground objects in ImageNet from their backgrounds, using an open-world object detector~\cite{Ren2024} and fill in the background in a neutral way using an object removal model~\cite{Sun2025,Suvorov2022}. % NOTE(review): keys unified with the method section (\cite{Sun2025,Suvorov2022}); confirm against the .bib
(2)~This allows us to then recombine any foreground object with any background on the fly, creating a highly diverse training set.
% During recombination, we can control important parameters, like the size and position of the foreground object, to help the model learn the spatial invariances necessary for image classification.
By exploiting the control over foreground size and position during recombination, \schemename explicitly teaches spatial invariances that image classification models typically must learn implicitly.
We show that using \schemename additionally to strong traditional data augmentation increases the model accuracy of Transformers by up to $4.5$ p.p. on ImageNet and reduces the error rate by up to $7.3$ p.p. in downstream tasks.
To encode compositional invariances directly in the training data, we propose \emph{Foreground-Background Augmentation} (\schemename), a controlled composition augmentation that \emph{explicitly factorizes each image into foreground and background, then recombines them for label-preserving, interpretable distribution shifts}.
Concretely, \schemename uses off-the-shelf segmentation and inpainting models to (i) extract a foreground object and synthesize a class-consistent, semantically neutral background, and (ii) paste the foreground onto diverse neutral backgrounds while controlling its position and scale (see \Cref{tab:foraug-examples}).
Unlike prior copy-paste methods that simply overlay objects onto arbitrary scenes~\cite{Ghiasi2021,Kang2022}, \schemename first removes and neutralizes the original background, then samples from well-defined distributions of backgrounds, object positions, and object sizes.
This explicit factorization preserves a clean label for the recombined image while providing direct control over compositions, enabling us to break spurious correlations while still fitting seamlessly into modern strong augmentation pipelines. % (see \Cref{fig:fig-1}).
% Throughout, we apply \schemename on top of strong augmentation pipelines (RandAugment, Mixup, CutMix), so any gains are complementary to these widely used techniques.
% As it is important that any gains are complementary to strong augmentation pipelines (RandAugment, MixUp, CutMix), we apply \schemename on top of these widely used techniques.
To ensure that all gains are complementary to strong augmentation pipelines (RandAugment, MixUp, CutMix), we apply \schemename on top of these widely used techniques.
Beyond training, \schemename becomes a diagnostic tool for analyzing model behavior and biases, when used during evaluation.
We utilize our control over the image distribution to measure a model's background robustness (by varying the choice of background), foreground focus (by leveraging our knowledge about the placement of the foreground object), center bias (by controlling position), and size bias (by controlling size).
These analyses provide valuable insights into model behavior and biases, which is crucial for model deployment and future robustness optimizations.
We show that training using \schemename significantly reduces all of these biases.
We make our code for \schemename and the output of \schemename's segmentation phase on ImageNet publicly available\footnote{Link will go here.} to facilitate further research.
Empirically, \schemename yields consistent accuracy gains across architectures, improving ImageNet top-1 accuracy by up to 6 p.p. and fine-grained downstream accuracy by up to 7.3 p.p., and even improving transfer when ImageNet accuracy is matched. % TODO(review): ``6 p.p.'' here vs.\ ``4.5 p.p.'' claimed elsewhere in this section -- unify the headline number
Beyond accuracy, training with \schemename substantially improves robustness on standard distribution-shift benchmarks, where we observe gains of roughly 2--19 p.p. across ViT, Swin, and ResNet architectures.
\subsection*{Contributions}
\begin{itemize}
\item We propose \schemename, a novel data augmentation scheme, that recombines objects and backgrounds. \schemename allows us to move beyond the (possibly biased) image compositions in the dataset while preserving label integrity.
\item We show that training a standard ViT using \schemename leads to up to 4.5 p.p. improved accuracy on ImageNet-1k and 7.3 p.p. on downstream tasks.
\item We propose novel \schemename-based metrics to analyze and quantify fine-grained biases of trained models: Background Robustness, Foreground Focus, Center Bias, and Size Bias. We show that \schemename significantly reduces these biases by encoding invariance that benefits ViT into the training data.
\end{itemize}
Finally, the same control knobs enable \schemename to serve as a targeted diagnostic tool for shortcut reliance and model robustness.
We quantify background reliance via controlled background swaps, and probe center and size biases through systematic position and scale sweeps, showing that training with \schemename reduces model biases.
\medskip
\noindent
\textbf{Contributions}
\begin{itemize}[topsep=0pt]
\item \textbf{Controlled composition augmentation for classification.}
We introduce \schemename, a foreground-background factorization and recombination scheme for image classification that creates label-preserving training samples with explicit control over background identity, object position, and object scale.
\item \textbf{Accuracy and transfer gains.}
Training with \schemename, in addition to standard strong augmentation pipelines, improves ImageNet top-1 accuracy by up to $6$ p.p., boosts fine-grained downstream accuracy by up to $7.3$ p.p., and increases accuracy on shifted distributions by up to $19$ p.p.
\item \textbf{Controlled bias diagnostics and mitigation.}
Using the same controls during evaluation, we measure background reliance, foreground focus, and position/scale biases through targeted distribution shifts.
\schemename systematically reduces shortcut behaviors and model biases.
\end{itemize}

73
sec/intro_old.tex Normal file
View File

@@ -0,0 +1,73 @@
% !TeX root = ../main.tex
\section{Introduction}
\label{sec:intro}
% \begin{itemize}
% \item General Into Image classification
% \item ImageNet
% \item CNNs $\to$ Transformers
% \item Traditional Data Augmentation: CNNs
% \item Problems with that: Other model properties of Transformers
% \item Our approach: Recombining ImageNet forgrounds and backgrounds
% \end{itemize}
\begin{figure}
\centering
\includegraphics[width=.5\columnwidth]{img/fig-1.pdf}
\caption{Comparison of traditional image classification training and training when using \schemename. \schemename recombines foreground objects with different backgrounds each epoch, thus creating a more diverse training set. We still apply strong traditional data augmentation afterwards.}
\label{fig:fig-1}
\end{figure}
Image classification, a fundamental task in computer vision (CV), involves assigning labels to images from a set of categories.
It underpins a wide range of applications, like medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2023a}, and object recognition~\cite{Carion2020,He2017,Girshick2014} and facilitates large-scale pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b}, and progress evaluation in CV~\cite{Khan2022, Rangel2024}.
% Furthermore, image classification is used for large-scale pretraining of vision models~\cite{Dosovitskiy2021,Liu2021,Touvron2021b} and to judge the progress of the field of CV \cite{Khan2022, Rangel2024}.
The advent of large-scale datasets, particularly ImageNet~\cite{Deng2009}, served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016}, and ImageNet has remained the most important CV benchmark for more than a decade~\cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}.
% containing millions of labeled images across thousands of categories, has been instrumental in driving significant progress in this field.
% ImageNet served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016} and remains the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}.
% It is used to train and evaluate the best models in the field.
While traditionally, convolutional neural networks (CNNs) have been the go-to architecture in CV, Transformers \cite{Vaswani2017}, particularly the Vision Transformer (ViT) \cite{Dosovitskiy2021}, have emerged as a powerful alternative and go-to architecture, demonstrating
% These attention-based models have demonstrated
superior performance in various vision tasks, including image classification \cite{Wortsman2022,Yu2022,Carion2020,Zong2023,Wang2023b}.
Data augmentation is a key technique for training image classification models.
% A key technique for training image classification models, especially with limited data, is data augmentation.
Traditional augmentation methods, such as cropping, flipping, or color shifts, are commonly employed to increase data diversity~\cite{Xu2023d, Shorten2019}, but remain bound to existing image compositions.
While these preserve the images' semantic meaning, their ability to teach spatial invariances is limited.
% the diversity of the training data and improve the model's performance~\cite{Xu2023d, Shorten2019}.
% These basic transformations, originally designed for CNNs, change the input images in a way that preserves their semantic meaning~\cite{Alomar2023}, but are limited to existing image compositions.
While combinations of these data augmentations are still used today, they originally were proposed to benefit CNNs.
However, the architectural differences of CNNs and Transformers suggest that the latter might benefit from different data augmentation strategies.
In particular, the self-attention mechanism, unlike a CNN, is not translation equivariant~\cite{RojasGomez2023,Ding2023a}, meaning that the model is not designed to understand the spatial relationships between pixels.
% This creates the need for novel data augmentation strategies tailored to the Transformer architecture.
% This fact opens a new design space for data augmentation strategies to help Transformers understand the basic invariances of image classification.
% Note that these traditional data augmentations are also limited by existing image compositions.
Recognizing that Transformers need to learn spatial relationships directly from data,
% and in general are usually trained on larger datasets~\cite{Kolesnikov2020},
we propose \schemename, a data augmentation method that makes these relationships explicit by recombining foreground objects with diverse backgrounds.
Thus, \schemename goes beyond existing image compositions and encodes desired invariances directly into the training data (see \Cref{fig:fig-1}).
% Inspired by this inductive bias of CNNs, that is not inherent to ViTs, we propose \schemename, a novel data augmentation scheme for image classification which makes the translation equivariance of CNNs explicit in the training data by recombining foreground objects at varying positions with different backgrounds.
% In this paper, we address the challenge of effectively training Transformers for image classification by proposing \schemename, a novel data augmentation scheme for image classification, which combines foreground objects with different backgrounds.
% Applying \schemename to ImageNet gives rise to \name, a novel dataset that enables this data augmentation with with fine-grained control over the image composition.
Applying \schemename to a dataset like ImageNet is a two-step process:
(1)~We separate the foreground objects in ImageNet from their backgrounds, using an open-world object detector~\cite{Ren2024} and fill in the background in a neutral way using an object removal model~\cite{Sun2025,Suvorov2022}.
(2)~This allows us to then recombine any foreground object with any background on the fly, creating a highly diverse training set.
% During recombination, we can control important parameters, like the size and position of the foreground object, to help the model learn the spatial invariances necessary for image classification.
By exploiting the control over foreground size and position during recombination, \schemename explicitly teaches spatial invariances that image classification models typically must learn implicitly.
We show that using \schemename additionally to strong traditional data augmentation increases the model accuracy of Transformers by up to 4.5 p.p. on ImageNet and reduces the error rate by up to $7.3$ p.p. in downstream tasks.
Beyond training, \schemename becomes a diagnostic tool for analyzing model behavior and biases, when used during evaluation.
We utilize our control over the image distribution to measure a model's background robustness (by varying the choice of background), foreground focus (by leveraging our knowledge about the placement of the foreground object), center bias (by controlling position), and size bias (by controlling size).
These analyses provide valuable insights into model behavior and biases, which is crucial for model deployment and future robustness optimizations.
We show that training using \schemename significantly reduces all of these biases.
We make our code for \schemename and the output of \schemename's segmentation phase on ImageNet publicly available\footnote{Link will go here.} to facilitate further research.
\subsection*{Contributions}
\begin{itemize}
\item We propose \schemename, a novel data augmentation scheme, that recombines objects and backgrounds. \schemename allows us to move beyond the (possibly biased) image compositions in the dataset while preserving label integrity.
\item We show that training a standard ViT using \schemename leads to up to 4.5 p.p. improved accuracy on ImageNet-1k and 7.3 p.p. on downstream tasks.
\item We propose novel \schemename-based metrics to analyze and quantify fine-grained biases of trained models: Background Robustness, Foreground Focus, Center Bias, and Size Bias. We show that \schemename significantly reduces these biases by encoding invariance that benefits ViT into the training data.
\end{itemize}

View File

@@ -3,11 +3,20 @@
%\begin{figure*}[ht!]
% \centering
% \includegraphics[width=.9\textwidth]{img/fig-2.pdf}
% \caption{Overview of \name. The data creation consists of two stages: (1, offline) Segmentation, where we segment the foreground objects from the background and fill in the background. (2, online) Recombination, where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.}
% \caption{Overview of \name. The data creation consists of two stages: (1, offline) Segmentation, where we segment the foreground objects from the background and fill in the background. (3, online) Recombination, where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.}
% \label{fig:method}
%\end{figure*}
\section{\schemename (Method)}
\begin{figure*}[t]
\centering
\includegraphics[width=\textwidth]{img/fig-2.pdf}
\caption{Overview of \schemename.
We segment the foreground object and inpaint the removed region to obtain a neutral background (Offline, \Cref{sec:segmentation}).
We then paste the foreground onto a sampled background while controlling position and scale, then apply standard strong traditional augmentations (Online, \Cref{sec:recombination}).}
\label{fig:method}
\end{figure*}
\section{\schemename}
\label{sec:method}
% \begin{itemize}
@@ -31,83 +40,98 @@
% We introduce \schemename, a data augmentation scheme designed to enhance Transformer training by explicitly separating and recombining foreground objects and backgrounds.
% \schemename enhances transformer training by explicitly encoding spatial invariances that these need to learn explicitly in the data.
% \schemename involves two stages: Segmentation and Recombination, both visualized in \Cref{fig:method}.
We introduce \schemename, a data augmentation designed to enhance Transformer training by embedding spatial invariances--which Transformers would otherwise need to learn implicitly--directly into the training data.
We introduce \schemename, a data augmentation designed to enhance training by embedding spatial invariances, which Transformers would otherwise need to learn implicitly, directly into the training data.
% It operates by explicitly segmenting and recombining foreground objects and backgrounds.
\schemename comprises two distinct stages: Segmentation and Recombination. Both stages are illustrated in \Cref{fig:method}.
\schemename comprises two distinct stages: Segmentation and Recombination. Both are illustrated in \Cref{fig:method}.
\subsection{Segmentation}
\label{sec:segmentation}
The segmentation stage isolates the foreground objects and their corresponding backgrounds.
% We then fill in the background in a visually plausible way~\cite{Sun2024} using a pretrained object-removal model.
We then fill the background using a pretrained object-removal model, producing visually plausible~\cite{Sun2024}, neutral scenes ready for recombination.
The offline segmentation stage produces reusable assets for recombination.
% The segmentation stage isolates the foreground objects and their corresponding backgrounds.
For each labeled training image, we create a pair $(\mathrm{fg},\mathrm{bg})$ consisting of (\textit{i}) a foreground cut-out $\mathrm{fg}$ with an alpha mask and (\textit{ii}) an inpainted background image $\mathrm{bg}$ where the foreground region has been removed.
This stage is computed once offline and the results are stored for the recombination stage.
First, foreground objects are detected and segmented from their backgrounds using a prompt-based segmentation model to exploit the classification dataset's labels.
We use the state-of-the-art Grounded SAM~\cite{Ren2024}, which is based on Grounding DINO~\cite{Liu2023e} and SAM~\cite{Kirillov2023}.
The prompt we use is ``\code{a <class name>, a type of <object category>}'', where \code{<class name>} is the specific name of the object's class as defined by the dataset and \code{<object category>} is the broader category of the object.
The \code{<object category>} guides the segmentation model towards the correct object in case the \code{<class name>} alone is too specific.
This can be the case with prompts like ``sorrel'' or ``guenon'', where the more general name ``horse'' or ``monkey'' is more helpful.
We derive the \code{<object category>} from the WordNet hierarchy, using the immediate hypernym.
% We iteratively extract up to $n$ foreground masks for each dataset-image, using different more and more general prompts based on the more general synsets of WordNet (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...).
We iteratively extract $n$ foreground masks for each dataset-image, creating prompts by going one hypernym up the WordNet-tree each step (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...).
Masks that are very similar, with a pairwise IoU of at least $0.9$, are merged.
The output is a set of masks delineating the foreground objects and the backgrounds.
\textbf{Generate candidate foreground masks.}
We obtain foreground candidates with Grounded SAM~\cite{Ren2024} (Grounding DINO~\cite{Liu2024a} + SAM~\cite{Kirillov2023}).
We leverage the dataset label by prompting the model with ``\code{a <class name>, a type of <object category>}''.
Here \code{<object category>} is the immediate WordNet hypernym of the class (e.g., ``sorrel'' $\rightarrow$ ``horse''), which improves robustness when the class name is rare or overly specific.
This can be the case with prompts like ``sorrel'' or ``guenon'', where the more general name ``horse'' or ``monkey'' is more ubiquitous.
To increase recall, we generate up to $N=3$ masks per image by iteratively moving one level up the hypernym chain (e.g., ``sorrel'' $\rightarrow$ ``horse'' $\rightarrow$ ``equine'' $\dots$).
We merge near-duplicate masks with pairwise IoU $\ge 0.9$, yielding a small set of $n_i \le N$ candidate masks per image $i$.
We select the best mask per image (according to \Cref{eq:filtering-score}) in a later filtering step, described below.
First, an inpainting model that is specifically optimized to remove objects from images, such as LaMa~\cite{Suvorov2021} or Attentive Eraser~\cite{Sun2024}, is used to inpaint the foreground regions in the backgrounds.
Then, to ensure the quality of the foregrounds and the neutral background images, we select a foreground/background pair (for each dataset-image) from the $\leq n$ variants we have extracted and infilled in the previous steps.
Using an ensemble $E$ of six ViT, ResNet, and Swin Transformer models pretrained on the original dataset, we select the foreground/background pair that maximizes foreground performance while minimizing the performance on the background and size of the foreground.
For each model $m \in E$, we predict the score of the ground truth class $c$ on the foreground $\mathrm{fg}$ and background $\mathrm{bg}$ and weigh these with the size $\operatorname{size}(\cdot)$ in number of pixels according to:
% $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels.
\textbf{Create neutral backgrounds via object removal.}
Given a candidate mask, we remove the masked region and inpaint it using an object-removal model (LaMa~\cite{Suvorov2022} or Attentive Eraser~\cite{Sun2025}).
This produces a visually plausible, ``neutral'' candidate background that can be paired with many foregrounds.
For an image $i$ we now have $n_i$ foreground objects, extracted from $i$ by cutting out the masked region, each paired with a background where the same mask has been infilled.
\textbf{Select a high-quality pair.}
Different masks can trade off including the full object versus leaking class cues into the background.
We therefore score each candidate pair using an ensemble $E$ of six pretrained classifiers (ViT/ResNet/Swin) trained on the original dataset.
Intuitively, we prefer (\textit{i}) foregrounds that strongly support the ground-truth class and (\textit{ii}) backgrounds that do \emph{not} support the ground-truth class, while (\textit{iii}) discouraging overly large foreground regions.
For each model $m \in E$, we compute the class scores of the ground truth class $c$, $\P[m(\mathrm{fg})=c]$ on the foreground (with solid-gray background) and $\P[m(\mathrm{bg})=c]$ on the background and combine them with a prior $\operatorname{size}(\cdot)$ (pixel count):
\begin{align} \begin{split} \label{eq:filtering-score}
\text{score}(\mathrm{fg}, \mathrm{bg}, c) &= \log \left( \frac{1}{\abs{E}} \sum_{m \in E} \P[m(\mathrm{fg}) = c] \right) \\
& + \log \left( 1 - \frac{1}{\abs E} \sum_{m \in E} \P[m(\mathrm{bg}) = c] \right) \\
\text{score}(\mathrm{fg}, \mathrm{bg}, c) &= \log \left( \sum_{m \in E} \frac{\P[m(\mathrm{fg}) = c]}{\abs{E}} \right)
+ \log \left( 1 - \sum_{m \in E} \frac{\P[m(\mathrm{bg}) = c]}{\abs E} \right) \\
& + \lambda \log \left( 1 - \abs{\frac{\operatorname{size}(\mathrm{fg})}{\operatorname{size}(\mathrm{bg})} - \eps} \right).
\end{split} \end{align}
% We use $E$ is the ensemble of models and $m$ is a pretrained model, $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels.
% We set $\lambda = 2$ and $\eps = 0.1$ via a small hyperparameter search on a manually annotated subset.
We run a hyperparameter search using a manually annotated subset of foreground/background variants to find the factors in \Cref{eq:filtering-score}: $\lambda = 2$ and $\eps = 0.1$.
% The \textit{optimal foreground size} of $10\%$ of the full image balances the smallest possible foreground size that encompasses all the respective class information in the image with still conveying the foreground information after pasting it onto another background.
% This filtering step ensures we segment all the relevant foreground objects.
For each image, we keep the candidate mask with the highest score.
Finally, we filter out backgrounds that are largely infilled, as these tend to be overly synthetic and do not carry much information (see the supplementary material).
% We ablate this choice in \Cref{sec:ablation}.
% While the computational cost for the segmentation stage is significant, this is a one-time calculation whose results can be reused in subsequent experiments (see the supplementary material for details).
Although the segmentation stage is computational overhead, it is a one-time cost with results that can be reused across experiments (see the supplementary material for details).
In summary, we factorize the dataset into a set of foreground objects with a transparent background and a set of diverse backgrounds per class.
The next step is to recombine these, before applying other common data augmentation operations during training.
\textbf{Filter low-quality backgrounds.}
Finally, we discard backgrounds that are heavily ($\geq 80\%$) inpainted, as they tend to look synthetic and provide little useful diversity (see supplementary).
This step filters out $10\%$ of backgrounds.
Although segmentation is the main computational overhead, it is performed once offline and reused across all training runs.
On NVIDIA H100 GPUs, the segmentation stage computes at a rate of $5\,338.6\,\frac{\text{img}}{\text{GPU} \times \text{h}}$ when inpainting with LaMa.
For ImageNet this comes down to just under $30$ hours on a single node.
At roughly twice the cost of a single ViT-B training run ($\approx 14$ hours), this is a modest investment that is amortized over every subsequent experiment the dataset is used in.
For details see the supplementary material.
% Compare this to $\approx 14$ hours for training ViT-B on ImageNet once.
The output of the segmentation stage is a collection of foreground cut-outs (with transparency) and a pool of diverse, neutral backgrounds, which we use in the online recombination stage.
For ImageNet, we provide pre-computed segmentation output\footnote{\code{URL will go here}}.
\subsection{Recombination}
\label{sec:recombination}
The recombination stage, performed online during training, combines the foreground objects with different backgrounds to create new training samples.
For each object, we follow the pipeline of: Pick an appropriate background, resize it to a fitting size, and place it in the background image.
Through this step, we expose the model to variations beyond the image compositions of the dataset.
In each epoch, the recombination stage generates a recombined training sample for each foreground by (\textit{i}) choosing a background, (\textit{ii}) choosing a target foreground size, (\textit{iii}) sampling a placement, and (\textit{iv}) pasting the foreground using its alpha mask.
This exposes the model to controlled changes in context and spatial layout that are largely absent from standard augmentation.
For each foreground object, we sample a background using one of the following strategies:
(1) the original image background, (2) the set of backgrounds from the same class, or (3) the set of all possible backgrounds.
These sets are trading off the amount of information the model can learn from the background against the diversity of new images created.
In each epoch, each foreground object is seen exactly once, but a background may appear multiple times.
\textbf{Background sampling.}
For each foreground object, we draw a background using one of three increasingly challenging strategies:
(\textit{i}) \textit{Original}: use the object's own inpainted background (no context shift);
(\textit{ii}) \textit{Same-class}: sample a background from the pool of backgrounds belonging to the same class (slight, but plausible context shift);
(\textit{iii}) \textit{All-classes}: sample from the pool of all inpainted backgrounds (large context shift).
These strategies trade off context diversity against semantic plausibility.
We ensure that each foreground is used exactly once per epoch; backgrounds may repeat.
The selected foreground is resized based on its relative size within its original image and the relative size of the original foreground in the selected background image.
The final size is randomly selected from a 30\% range around upper and lower limits ($s_u$ and $s_l$), based on the original sizes.
% \begin{align}
% s \sim \mathcal U \left[ (1 - 0.3) s_l, (1 + 0.3) s_u \right].
% \end{align}
To balance the size of the foreground and that of the backgrounds original foreground, the upper and lower limit $s_u$ and $s_l$ are set to the mean or range of both sizes, depending on the foreground size strategy: \emph{mean} or \emph{range}.
\textbf{Foreground scaling.}
Let $r_{\text{fg}}$ denote the relative foreground area in the source image of the foreground, and $r_{\text{bg}}$ the relative foreground area in the source image of the background. % of the \emph{original} foreground (before inpainting) in the chosen background image.
We compute the lower/upper size limits $(s_l, s_u)$ from these two ratios using one of two variants:
(\textit{i}) \emph{mean} sets $(s_l, s_u)$ using the mean of $r_{\text{fg}}$ and $r_{\text{bg}}$, while
(\textit{ii}) \emph{range} uses the min/max to preserve a wider scale range.
Then, we sample the final scale from a $\pm 30\%$ interval around them and resize the foreground to this scale, while keeping the aspect ratio.
The resized foreground is then placed at a random position within the background image.
To more seamlessly integrate the foreground, we apply a Gaussian blur with ${\sigma \in [\frac{\sigma_{\text{max}}}{10}, \sigma_{\text{max}}]}$, inspired by the standard range for the Gaussian blur operation in \cite{Touvron2022}, to the foreground's alpha-mask.
We can apply standard data augmentation techniques in two modes:
Either we apply all augmentations to the recombined image, or we apply the cropping and resizing to the background only and then apply the other augmentations after recombination.
% While for the second mode, the foreground object will always be fully visible, the first mode uses the data augmentations in the same way they would be used for the baseline dataset.
% The second mode ensures the foreground object remains fully visible, while the first mode mirrors standard data augmentation practices.
The first mode mirrors standard augmentation practice, whereas the second one ensures the foreground object remains fully visible.
\textbf{Placement and boundary smoothing.}
We paste the resized foreground at a uniformly random location within the background.
To reduce cut-and-paste artifacts, we slightly soften the alpha mask boundary by applying a Gaussian blur with $\sigma \in [\frac{\sigma_{\text{max}}}{10}, \sigma_{\text{max}}]$, following the range used in modern augmentation~\cite{Touvron2022}.
% For example recombined images see \Cref{tab:foraug-examples}.
We experiment with a constant mixing ratio, or a linear or cosine annealing schedule that increases the amount of images from the original dataset over time.
The mixing ratio acts as a probability of selecting an image from the original dataset;
otherwise, an image with the same foreground is recombined using \schemename, ensuring each object is seen once per epoch.
% Thus, we still ensure each foreground is seen once per epoch.
The recombination stage is designed to be parallelized on the CPU during training and thus does not impact training time (see supplementary material for details).
% \textbf{Interaction with standard augmentation.}
% We support two augmentation orders:
% (\textit{i}) apply the full augmentation pipeline after recombination; or
% (\textit{ii}) apply crop+resize to the background first (to keep the full foreground visible), then recombine, then apply the remaining augmentations.
% The former matches standard training exactly; the latter isolates composition changes from random cropping.
\textbf{Mixing with original images.}
We optionally mix recombined samples with unmodified dataset images.
A mixing ratio $p$ acts as the probability of drawing the original image; otherwise we use its foreground and apply \schemename.
We consider constant $p$ as well as linear/cosine schedules that increase $p$ over training.
Finally, we apply standard data augmentation techniques on the resulting images.
The online recombination is CPU-parallel and adds only negligible overhead to training:
we measure a ${\approx}1\%$ increase in average step-time (see supplementary).

120
sec/method_old.tex Normal file
View File

@@ -0,0 +1,120 @@
% !TeX root = ../main.tex
%\begin{figure*}[ht!]
% \centering
% \includegraphics[width=.9\textwidth]{img/fig-2.pdf}
% \caption{Overview of \name. The data creation consists of two stages: (1, offline) Segmentation, where we segment the foreground objects from the background and fill in the background. (2, online) Recombination, where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.}
% \label{fig:method}
%\end{figure*}
\begin{figure*}[t]
\centering
\includegraphics[width=\textwidth]{img/fig-2.pdf}
\caption{Overview of \schemename. The data creation consists of two stages: Segmentation (offline, \Cref{sec:segmentation}), where we segment the foreground objects from the background and fill in the background. Recombination (online, \Cref{sec:recombination}), where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.}
\label{fig:method}
\end{figure*}
\section{\schemename (Method)}
\label{sec:method}
% \begin{itemize}
% \item[1.] Segment ImageNet
% \item Detect and Cutout Foreground
% \item Multiple foreground possibilities
% \item Foreground mask merging
% \item Background infills
% \item Foreground/Background Filtering
% \item [2.] Recombination
% \item Which foreground \& Background
% \item Background pruning
% \item size
% \item positioning
% \item Border smoothing
% \item Dealing with other data augmentations/transformations
% \end{itemize}
% We propose a novel dataset, called \name, that improves image classification performance by explicitly separating and recombining foreground objects and plain backgrounds.
% \name consists of two stages: Segmentation and recombination. Both are visualized in \Cref{fig:method}.
% We introduce \schemename, a data augmentation scheme designed to enhance Transformer training by explicitly separating and recombining foreground objects and backgrounds.
% \schemename enhances transformer training by explicitly encoding spatial invariances that these need to learn explicitly in the data.
% \schemename involves two stages: Segmentation and Recombination, both visualized in \Cref{fig:method}.
We introduce \schemename, a data augmentation designed to enhance Transformer training by embedding spatial invariances---which Transformers would otherwise need to learn implicitly---directly into the training data.
% It operates by explicitly segmenting and recombining foreground objects and backgrounds.
\schemename comprises two distinct stages: Segmentation and Recombination. Both are illustrated in \Cref{fig:method}.
\subsection{Segmentation}
\label{sec:segmentation}
The segmentation stage isolates the foreground objects and their corresponding backgrounds.
% We then fill in the background in a visually plausible way~\cite{Sun2025} using a pretrained object-removal model.
We then fill the background using a pretrained object-removal model, producing visually plausible~\cite{Sun2025}, neutral scenes ready for recombination.
This stage is computed once offline and the results are stored for the recombination stage.
First, foreground objects are detected and segmented from their backgrounds using a prompt-based segmentation model to exploit the classification dataset's labels.
We use the state-of-the-art Grounded SAM~\cite{Ren2024}, which is based on Grounding DINO~\cite{Liu2024a} and SAM~\cite{Kirillov2023}.
The prompt we use is ``\code{a <class name>, a type of <object category>}'', where \code{<class name>} is the specific name of the object's class as defined by the dataset and \code{<object category>} is the broader category of the object.
The \code{<object category>} guides the segmentation model towards the correct object in case the \code{<class name>} alone is too specific.
This can be the case with prompts like ``sorrel'' or ``guenon'', where the more general name ``horse'' or ``monkey'' is more helpful.
We derive the \code{<object category>} from the WordNet hierarchy, using the immediate hypernym.
% We iteratively extract up to $n$ foreground masks for each dataset-image, using different more and more general prompts based on the more general synsets of WordNet (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...).
We iteratively extract $n$ foreground masks for each dataset-image, creating prompts by going one hypernym up the WordNet-tree each step (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...).
Masks that are very similar, with a pairwise IoU of at least $0.9$, are merged.
The output is a set of masks delineating the foreground objects and the backgrounds.
We select the best mask per image (according to \Cref{eq:filtering-score}) in a later filtering step, described below.
First, an inpainting model that is specifically optimized to remove objects from images, such as LaMa~\cite{Suvorov2022} or Attentive Eraser~\cite{Sun2025}, is used to inpaint the foreground regions in the backgrounds.
Then, to ensure the quality of the foregrounds and the neutral background images, we select a foreground/background pair (for each dataset-image) from the $\leq n$ variants we have extracted and infilled in the previous steps.
Using an ensemble $E$ of six ViT, ResNet, and Swin Transformer models pretrained on the original dataset, we select the foreground/background pair that maximizes foreground performance while minimizing the performance on the background and size of the foreground.
For each model $m \in E$, we predict the score of the ground truth class $c$ on the foreground $\mathrm{fg}$ and background $\mathrm{bg}$ and weigh these with the size $\operatorname{size}(\cdot)$ in number of pixels according to:
% $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels.
\begin{align} \begin{split} \label{eq:filtering-score}
\text{score}(\mathrm{fg}, \mathrm{bg}, c) &= \log \left( \sum_{m \in E} \frac{\P[m(\mathrm{fg}) = c]}{\abs{E}} \right)
+ \log \left( 1 - \sum_{m \in E} \frac{\P[m(\mathrm{bg}) = c]}{\abs E} \right) \\
& + \lambda \log \left( 1 - \abs{\frac{\operatorname{size}(\mathrm{fg})}{\operatorname{size}(\mathrm{bg})} - \eps} \right).
\end{split} \end{align}
% We use $E$ is the ensemble of models and $m$ is a pretrained model, $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels.
We run a hyperparameter search using a manually annotated subset of foreground/background variants to find the factors in \Cref{eq:filtering-score}: $\lambda = 2$ and $\eps = 0.1$.
% The \textit{optimal foreground size} of $10\%$ of the full image balances the smallest possible foreground size that encompasses all the respective class information in the image with still conveying the foreground information after pasting it onto another background.
% This filtering step ensures we segment all the relevant foreground objects.
Finally, we filter out backgrounds that are largely infilled, as these tend to be overly synthetic and do not carry much information (see the supplementary material).
% We ablate this choice in \Cref{sec:ablation}.
% While the computational cost for the segmentation stage is significant, this is a one-time calculation whose results can be reused in subsequent experiments (see the supplementary material for details).
Although the segmentation stage is computational overhead, it is a one-time cost with results that can be reused across experiments (see the supplementary material for details).
In summary, we factorize the dataset into a set of foreground objects with a transparent background and a set of diverse backgrounds per class.
The next step is to recombine these, before applying other common data augmentation operations during training.
\subsection{Recombination}
\label{sec:recombination}
The recombination stage, performed online during training, combines the foreground objects with different backgrounds to create new training samples.
For each object, we follow this pipeline: pick an appropriate background, resize the foreground to a fitting size, and place it in the background image.
Through this step, we expose the model to variations beyond the image compositions of the dataset.
For each foreground object, we sample a background using one of the following strategies:
(1) the original image background, (2) the set of backgrounds from the same class, or (3) the set of all possible backgrounds.
These sets trade off the amount of information the model can learn from the background against the diversity of new images created.
In each epoch, each foreground object is seen exactly once, but a background may appear multiple times.
The selected foreground is resized based on its relative size within its original image and the relative size of the original foreground in the selected background image.
The final size is randomly selected from a 30\% range around upper and lower limits ($s_u$ and $s_l$), based on the original sizes.
% \begin{align}
% s \sim \mathcal U \left[ (1 - 0.3) s_l, (1 + 0.3) s_u \right].
% \end{align}
To balance the size of the foreground and that of the background's original foreground, the upper and lower limits $s_u$ and $s_l$ are set to the mean or range of both sizes, depending on the foreground size strategy: \emph{mean} or \emph{range}.
The resized foreground is then placed at a random position within the background image.
To more seamlessly integrate the foreground, we apply a Gaussian blur with ${\sigma \in [\frac{\sigma_{\text{max}}}{10}, \sigma_{\text{max}}]}$, inspired by the standard range for the Gaussian blur operation in \cite{Touvron2022}, to the foreground's alpha-mask.
We can apply standard data augmentation techniques in two modes:
Either we apply all augmentations to the recombined image, or we apply the cropping and resizing to the background only and then apply the other augmentations after recombination.
% While for the second mode, the foreground object will always be fully visible, the first mode uses the data augmentations in the same way they would be used for the baseline dataset.
% The second mode ensures the foreground object remains fully visible, while the first mode mirrors standard data augmentation practices.
The first mode mirrors standard augmentation practice, whereas the second one ensures the foreground object remains fully visible.
We experiment with a constant mixing ratio, or a linear or cosine annealing schedule that increases the amount of images from the original dataset over time.
The mixing ratio acts as a probability of selecting an image from the original dataset;
otherwise, an image with the same foreground is recombined using \schemename, ensuring each object is seen once per epoch.
% Thus, we still ensure each foreground is seen once per epoch.
The recombination stage is designed to be parallelized on the CPU during training and thus does not impact training time (see supplementary material for details).

View File

@@ -3,44 +3,44 @@
\section{Related Work}
\label{sec:related_work}
\paragraph{Data Augmentation for Image Classification}
Data augmentation is a crucial technique for improving the performance and generalization of image classification models.
Traditional augmentation strategies rely on simple geometric or color-space transformations like cropping, flipping, rotation, blurring, color jittering, or random erasing \cite{Zhong2017} to increase the diversity of the training data without changing their semantic meaning.
With the advent of Vision Transformers, new data augmentation operations like PatchDropout \cite{Liu2022d} have been proposed.
Other transformations like Mixup \cite{Zhang2018a}, CutMix \cite{Yun2019}, or random cropping and patching \cite{Takahashi2018} combine multiple input images.
These simple transformations are usually bundled to form more complex augmentation policies like AutoAugment \cite{Cubuk2018} and RandAugment \cite{Cubuk2019},
% which automatically search for optimal augmentation policies
or 3-augment \cite{Touvron2022} which is optimized to train a ViT.
For a general overview of data augmentation techniques for image classification, we refer to \citet{Shorten2019, Xu2023d}.
\textbf{Data Augmentation for Image Classification.}
Data augmentation is a crucial technique for improving the model performance and generalization.
Traditional augmentation strategies rely on simple geometric or color-space transformations like cropping, flipping, rotation, blurring, color jittering, or random erasing~\cite{Zhong2020} to increase training data diversity without changing the semantic meaning.
With the advent of ViTs~\cite{Dosovitskiy2021}, new data augmentation operations like PatchDropout~\cite{Liu2022d} have been proposed.
Other transformations like MixUp~\cite{Zhang2018a}, CutMix~\cite{Yun2019}, or random cropping and patching~\cite{Takahashi2018} combine multiple input images.
These simple transformations are usually bundled to form more complex augmentation policies like AutoAugment~\cite{Cubuk2019} and RandAugment~\cite{Cubuk2020}, or 3-Augment~\cite{Touvron2022}. %, which is optimized to train a ViT.
For a general overview of data augmentation for image classification, we refer to Shorten et al.~\cite{Shorten2019} and Xu et al.~\cite{Xu2023d}.
We build upon these general augmentations by introducing a novel approach to explicitly separate objects and backgrounds for image classification, allowing us to -- unlike these basic transformations -- move beyond dataset image compositions.
Our approach is used additionally to strong traditional techniques to improve performance and reduce biases.
We advance these general augmentations by introducing \schemename to explicitly separate objects and backgrounds for image classification, allowing us to move beyond image compositions from the dataset.
Thus, \schemename unlocks performance improvements and bias reduction not possible with traditional data augmentation.
% \schemename is used additionally to traditional augmentation techniques to improve performance and reduce biases.
\paragraph{Copy-Paste Augmentation}
The copy-paste augmentation \cite{Ghiasi2020}, which is used only for object detection \cite{Shermaine2025,Ghiasi2020} and instance segmentation \cite{Werman2021,Ling2022}, involves copying segmented objects from one image and pasting them onto another.
While typically human annotated segmentation masks are used to extract the foreground objects, other foreground sources have been explored, like 3D models \cite{Hinterstoisser2019} and pretrained object-detection models for use on objects on white background \cite{Dwibedi2017} or synthetic images \cite{Ge2023}.
% DeePaste \cite{Werman2021} focuses on using inpainting for a more seamless integration of the pasted object.
\cite{Kang2022} apply copy-paste as an alternative to CutMix in image classification, but they do not shift the size or position of the foregrounds and use normal dataset images as backgrounds.
\textbf{Copy-Paste Augmentation.}
The copy-paste augmentation~\cite{Ghiasi2021}, which is used only for object detection~\cite{Shermaine2025,Ghiasi2021} and instance segmentation~\cite{Werman2022,Ling2022}, involves copying segmented objects from one image and pasting them onto another.
While typically human annotated segmentation masks are used to extract the foreground objects, other foreground sources have been explored, like 3D models~\cite{Hinterstoisser2019} and pretrained object-detection models for use on objects on white background~\cite{Dwibedi2017} or synthetic images~\cite{Ge2023}.
Kang et al.~\cite{Kang2022} apply copy-paste as an alternative to CutMix in image classification, but they do not shift the size or position of the foregrounds and use dataset images (with object) as backgrounds.
% Unlike these methods, \schemename focuses on image classification.
% While these methods paste objects onto another image (with a different foreground) or on available or rendered background images of the target scene, we extract foreground objects and fill in the resulting holes in the background in a semantically neutral way.
Unlike prior copy-paste methods that overlay objects, \schemename extracts foregrounds and replaces their backgrounds with semantically neutral fills, thereby preserving label integrity while enabling controlled and diverse recombination.
% This way, we are preserving label integrity while also having diverse, neutral backgrounds available for recombination, enabling a controlled and diverse manipulation of image composition.
\begin{figure*}[ht!]
\centering
\includegraphics[width=.9\textwidth]{img/fig-2.pdf}
\caption{Overview of \schemename. The data creation consists of two stages: Segmentation (offline, \Cref{sec:segmentation}), where we segment the foreground objects from the background and fill in the background. Recombination (online, \Cref{sec:recombination}), where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.}
\label{fig:method}
\end{figure*}
\textbf{Generative data augmentation.}
Recent work uses generative models to synthesize additional training images, e.g., via GANs or diffusion models driven by text prompts or attribute labels~\cite{Lu2022,Trabucco2024,Islam2024}.
Concurrently to our work, AGA~\cite{Rahat2025} combines LLMs, diffusion models, and segmentation to generate fully synthetic backgrounds from text prompts, onto which real foregrounds are pasted.
These synthetic images are appended to the original training set.
\paragraph{Model robustness evaluation}
While AGA focuses on increasing diversity via prompt-driven background synthesis, \schemename uses generative models differently:
We apply inpainting only to locally neutralize the original object region, yielding semi-synthetic backgrounds that preserve the global layout, style, and characteristics of real dataset images.
% AGA's focus on synthetic background is likely to produce a shifted, or even collapsed background image distribution~\cite{Zverev2025,Shumailov2024,Adamkiewicz2026}.
Fully synthetic, prompt-generated backgrounds are likely to change the effective background distribution, especially when prompts or generators are biased~\cite{Zverev2025,Shumailov2024,Adamkiewicz2026}.
We then do online recombination of real foregrounds with these neutralized, dataset-consistent backgrounds under explicit control of object position and scale.
Thus, \schemename acts as a dynamic large-scale augmentation method, while AGA statically expands small-scale training sets with synthetic data.
\textbf{Model robustness evaluation.}
Evaluating model robustness to various image variations is critical for understanding and improving model generalization.
Datasets like ImageNet-C \cite{Hendrycks2019} and ImageNet-P \cite{Hendrycks2019} introduce common corruptions and perturbations.
ImageNet-E \cite{Li2023e} evaluates model robustness against a collection of distribution shifts.
Other datasets, such as ImageNet-D \cite{Zhang2024f}, focus on varying background, texture, and material, but rely on synthetic data.
Stylized ImageNet \cite{Geirhos2018} investigates the impact of texture changes.
ImageNet-9 \cite{Xiao2020} explores background variations using segmented images, but backgrounds are often artificial.
Datasets like ImageNet-A~\cite{Hendrycks2021}, ImageNet-C~\cite{Hendrycks2019} and ImageNet-P~\cite{Hendrycks2019} introduce common corruptions and perturbations.
ImageNet-E~\cite{Li2023e} evaluates model robustness against a collection of distribution shifts.
Other datasets, such as ImageNet-D~\cite{Zhang2024f} and ImageNet-R~\cite{Hendrycks2021a}, focus on varying background, texture, and material, but rely on synthetic data.
Stylized ImageNet~\cite{Geirhos2019} investigates the impact of texture changes.
ImageNet-9~\cite{Xiao2020} explores background variations using segmented images for a 9-class subset of ImageNet with artificial backgrounds.
In contrast to these existing datasets, which are used only for evaluation, \schemename provides fine-grained control over foreground object placement, size, and background selection, enabling a precise and comprehensive analysis of specific model biases within the context of a large-scale, real-world image distribution.
As \schemename also provides controllable training set generation, it goes beyond simply measuring robustness to actively improving it through training.
As \schemename also provides controllable training data generation, it goes beyond simply measuring robustness to actively improving it through training.

View File

@@ -1,21 +0,0 @@
\backcite {Bates1955}{{1}{A}{figure.caption.1}}
\backcite {Jonhson1995}{{1}{A}{figure.caption.1}}
\backcite {You2020}{{2}{1}{table.caption.4}}
\backcite {Touvron2022}{{2}{1}{table.caption.4}}
\backcite {Touvron2021b}{{2}{1}{table.caption.4}}
\backcite {Yun2019}{{2}{1}{table.caption.4}}
\backcite {Zhong2017}{{2}{1}{table.caption.4}}
\backcite {Cubuk2019}{{2}{1}{table.caption.4}}
\backcite {Zhang2018a}{{2}{1}{table.caption.4}}
\backcite {Yun2019}{{2}{1}{table.caption.4}}
\backcite {Nauen2025}{{3}{C}{table.caption.5}}
\backcite {Touvron2022}{{3}{C}{table.caption.5}}
\backcite {Touvron2021b}{{3}{C}{table.caption.5}}
\backcite {Nauen2025}{{3}{C}{table.caption.5}}
\backcite {Paszke2019}{{3}{C}{table.caption.5}}
\backcite {Wightman2019}{{3}{C}{table.caption.5}}
\backcite {Deng2009}{{3}{D}{table.caption.7}}
\backcite {Suvorov2021}{{5}{E}{table.caption.8}}
\backcite {Sun2024}{{5}{E}{table.caption.8}}
\backcite {Ren2024}{{7}{2}{figure.caption.10}}
\backcite {Ren2024}{{7}{F}{figure.caption.10}}

Binary file not shown.

View File

@@ -1,67 +0,0 @@
% CVPR 2026 Paper Template; see https://github.com/cvpr-org/author-kit
\documentclass[10pt,twocolumn,letterpaper]{article}
%%%%%%%%% PAPER TYPE - PLEASE UPDATE FOR FINAL VERSION
% \usepackage{cvpr} % To produce the CAMERA-READY version
\usepackage[review]{cvpr} % To produce the REVIEW version
% \usepackage[pagenumbers]{cvpr} % To force page numbers, e.g. for an arXiv version
% Import additional packages in the preamble file, before hyperref
\usepackage[pagebackref,breaklinks,colorlinks,allcolors=cvprblue]{hyperref}
\input{packages}
% It is strongly recommended to use hyperref, especially for the review version.
% hyperref with option pagebackref eases the reviewers' job.
% Please disable hyperref *only* if you encounter grave issues,
% e.g. with the file validation for the camera-ready version.
%
% If you comment hyperref and then uncomment it, you should delete *.aux before re-running LaTeX.
% (Or just hit 'q' on the first LaTeX run, let it finish, and you should be clear).
\definecolor{cvprblue}{rgb}{0.21,0.49,0.74}
%%%%%%%%% PAPER ID - PLEASE UPDATE
\def\paperID{Supplementary} % *** Enter the Paper ID here
\def\confName{CVPR}
\def\confYear{2026}
%%%%%%%%% TITLE - PLEASE UPDATE
\newcommand{\name}{\textit{ForNet}\xspace}
\newcommand{\schemename}{\textit{ForAug}\xspace}
\title{\schemename: Mitigating Biases and Improving Vision Transformer Training by Recombining Foregrounds and Backgrounds \\ -- Supplementary Material --}
%%%%%%%%% AUTHORS - PLEASE UPDATE
\author{First Author\\
Institution1\\
Institution1 address\\
{\tt\small firstauthor@i1.org}
% For a paper whose authors are all at the same institution,
% omit the following lines up until the closing ``}''.
% Additional authors and addresses can be added with ``\and'',
% just like the second author.
% To save space, use either the email address or home page, not both
\and
Second Author\\
Institution2\\
First line of institution2 address\\
{\tt\small secondauthor@i2.org}
}
\begin{document}
\onecolumn
\maketitle
\begin{abstract}
This is the supplementary material for the paper: \schemename: Mitigating Biases and Improving Vision Transformer Training by Recombining Foregrounds and Backgrounds
\end{abstract}
\appendix
\input{sec/appendix}
{
\small
\bibliographystyle{ieeenat_fullname}
\bibliography{../JabRef/main_bib}
}
% WARNING: do not forget to delete the supplementary pages from your submission
% \input{sec/X_suppl}
\end{document}

View File

@@ -1,133 +0,0 @@
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage[submission]{aaai2026} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
\usepackage{helvet} % DO NOT CHANGE THIS
\usepackage{courier} % DO NOT CHANGE THIS
\usepackage[hyphens]{url} % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm} % DO NOT CHANGE THIS
\usepackage{natbib} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage{algorithmic}
%
% These are are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper.
\usepackage{newfloat}
\usepackage{listings}
\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
\lstset{%
basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
aboveskip=0pt,belowskip=0pt,%
showstringspaces=false,tabsize=2,breaklines=true}
\floatstyle{ruled}
\newfloat{listing}{tb}{lst}{}
\floatname{listing}{Listing}
\input{packages}
%
% Keep the \pdfinfo as shown here. There's no need
% for you to add the /Title and /Author tags.
\pdfinfo{
/TemplateVersion (2026.1)
}
% DISALLOWED PACKAGES
% \usepackage{authblk} -- This package is specifically forbidden
% \usepackage{balance} -- This package is specifically forbidden
% \usepackage{color (if used in text)
% \usepackage{CJK} -- This package is specifically forbidden
% \usepackage{float} -- This package is specifically forbidden
% \usepackage{flushend} -- This package is specifically forbidden
% \usepackage{fontenc} -- This package is specifically forbidden
% \usepackage{fullpage} -- This package is specifically forbidden
% \usepackage{geometry} -- This package is specifically forbidden
% \usepackage{grffile} -- This package is specifically forbidden
% \usepackage{hyperref} -- This package is specifically forbidden
% \usepackage{navigator} -- This package is specifically forbidden
% (or any other package that embeds links such as navigator or hyperref)
% \indentfirst} -- This package is specifically forbidden
% \layout} -- This package is specifically forbidden
% \multicol} -- This package is specifically forbidden
% \nameref} -- This package is specifically forbidden
% \usepackage{savetrees} -- This package is specifically forbidden
% \usepackage{setspace} -- This package is specifically forbidden
% \usepackage{stfloats} -- This package is specifically forbidden
% \usepackage{tabu} -- This package is specifically forbidden
% \usepackage{titlesec} -- This package is specifically forbidden
% \usepackage{tocbibind} -- This package is specifically forbidden
% \usepackage{ulem} -- This package is specifically forbidden
% \usepackage{wrapfig} -- This package is specifically forbidden
% DISALLOWED COMMANDS
% \nocopyright -- Your paper will not be published if you use this command
% \addtolength -- This command may not be used
% \balance -- This command may not be used
% \baselinestretch -- Your paper will not be published if you use this command
% \clearpage -- No page breaks of any kind may be used for the final version of your paper
% \columnsep -- This command may not be used
% \newpage -- No page breaks of any kind may be used for the final version of your paper
% \pagebreak -- No page breaks of any kind may be used for the final version of your paperr
% \pagestyle -- This command may not be used
% \tiny -- This is not an acceptable font size.
% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference
\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numbers are desired.
% The file aaai2026.sty is the style file for AAAI Press
% proceedings, working notes, and technical reports.
%
% Title
\newcommand{\name}{\textit{ForNet}\xspace}
\newcommand{\schemename}{\textit{ForAug}\xspace}
% Names: RecombiNet, RecombNet, ReMix, ReMixNet, FoReMix/ForeMix
%%%%%%%%% TITLE - PLEASE UPDATE
\title{\schemename: Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation\\-- Supplementary Material --}
%%%%%%%%% AUTHORS - PLEASE UPDATE
\author {
Tobias Christian Nauen\textsuperscript{\rm 1, \rm 2},
Brian Moser\textsuperscript{\rm 2},
Federico Raue\textsuperscript{\rm 2},
Stanislav Frolov\textsuperscript{\rm 2},
Andreas Dengel\textsuperscript{\rm 1, \rm 2}
}
\affiliations {
\textsuperscript{\rm 1}RPTU Kaiserslautern-Landau, Kaiserslautern, Germany \\
\textsuperscript{\rm 2}German Research Center for Artificial Intelligence (DFKI), Kaiserslautern, Germany \\
{\tt\small first\_second.last@dfki.de / first.last@dfki.de}
}
\begin{document}
\onecolumn
\maketitle
% \input{sec/abstract}
% \input{sec/intro}
% \input{sec/related_work}
% \input{sec/method}
% \input{sec/experiments}
% % \input{sec/future_work}
% \input{sec/conclusion}
% \input{sec/acks}
\begin{abstract}
This is the supplementary material for the paper: \schemename: Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation
\end{abstract}
% \newpage
\appendix
\input{sec/appendix}
\bibliography{../JabRef/main_bib}
\end{document}