diff --git a/arxiv_v2_arXiv/arxiv.zip b/arxiv_v2_arXiv/arxiv.zip new file mode 100644 index 0000000..a134df5 Binary files /dev/null and b/arxiv_v2_arXiv/arxiv.zip differ diff --git a/iccv.sty b/arxiv_v2_arXiv/cvpr.sty similarity index 85% rename from iccv.sty rename to arxiv_v2_arXiv/cvpr.sty index f8f39cd..6ce0bd8 100644 --- a/iccv.sty +++ b/arxiv_v2_arXiv/cvpr.sty @@ -9,7 +9,7 @@ % % use as % \documentclass[times,10pt,twocolumn]{article} -% \usepackage[options]{ICCV} +% \usepackage[options]{cvpr} % \usepackage{times} % % "options" should be replaced by @@ -25,7 +25,7 @@ % --------------------------------------------------------------- \NeedsTeXFormat{LaTeX2e}[1999/12/01] -\ProvidesPackage{iccv}[2025 LaTeX class for IEEE ICCV] +\ProvidesPackage{cvpr}[2026 LaTeX class for IEEE CVPR] \RequirePackage{times} % Integrate Times for here \RequirePackage{xspace} @@ -50,20 +50,20 @@ \RequirePackage[font=footnotesize,skip=3pt,subrefformat=parens]{subcaption} -\newtoggle{iccvfinal} % Camera-ready version -\newtoggle{iccvrebuttal} % Rebuttal -\newtoggle{iccvpagenumbers} % Force page numbers (in camera ready) -\toggletrue{iccvfinal} -\togglefalse{iccvrebuttal} -\togglefalse{iccvpagenumbers} -\DeclareOption{review}{\togglefalse{iccvfinal}\toggletrue{iccvpagenumbers}} -\DeclareOption{rebuttal}{\togglefalse{iccvfinal}\toggletrue{iccvrebuttal}} -\DeclareOption{pagenumbers}{\toggletrue{iccvpagenumbers}} -\DeclareOption*{\PackageWarning{iccv}{Unkown option `\CurrentOption'}} +\newtoggle{cvprfinal} % Camera-ready version +\newtoggle{cvprrebuttal} % Rebuttal +\newtoggle{cvprpagenumbers} % Force page numbers (in camera ready) +\toggletrue{cvprfinal} +\togglefalse{cvprrebuttal} +\togglefalse{cvprpagenumbers} +\DeclareOption{review}{\togglefalse{cvprfinal}\toggletrue{cvprpagenumbers}} +\DeclareOption{rebuttal}{\togglefalse{cvprfinal}\toggletrue{cvprrebuttal}} +\DeclareOption{pagenumbers}{\toggletrue{cvprpagenumbers}} +\DeclareOption*{\PackageWarning{cvpr}{Unkown option `\CurrentOption'}} 
\ProcessOptions\relax % Don't warn about missing author for rebuttal -\iftoggle{iccvrebuttal}{% +\iftoggle{cvprrebuttal}{% \ActivateWarningFilters[rebuttal] }{} @@ -212,19 +212,19 @@ % --------------------------------------------------------------- -\typeout{ICCV 8.5 x 11-Inch Proceedings Style `iccv.sty'.} +\typeout{CVPR 8.5 x 11-Inch Proceedings Style `cvpr.sty'.} % ten point helvetica bold required for captions % eleven point times bold required for second-order headings % in some sites the name of the fonts may differ, % change the name here: -\font\iccvtenhv = phvb at 8pt % *** IF THIS FAILS, SEE iccv.sty *** +\font\cvprtenhv = phvb at 8pt % *** IF THIS FAILS, SEE cvpr.sty *** \font\elvbf = ptmb scaled 1100 \font\tenbf = ptmb scaled 1000 % If the above lines give an error message, try to comment them and % uncomment these: -%\font\iccvtenhv = phvb7t at 8pt +%\font\cvprtenhv = phvb7t at 8pt %\font\elvbf = ptmb7t scaled 1100 %\font\tenbf = ptmb7t scaled 1000 @@ -241,37 +241,37 @@ % Suppress page numbers when the appropriate option is given -\iftoggle{iccvpagenumbers}{}{% +\iftoggle{cvprpagenumbers}{}{% \pagestyle{empty} } \AtBeginDocument{% % Print an error if document class other than article is used \@ifclassloaded{article}{}{% - \PackageError{iccv}{Package only meant to be used with document class `article'}{Change document class to `article'.} + \PackageError{cvpr}{Package only meant to be used with document class `article'}{Change document class to `article'.} } % Print a warning if incorrect options for article are specified \@ifclasswith{article}{10pt}{}{% - \PackageWarningNoLine{iccv}{Incorrect font size specified - ICCV requires 10-point fonts. Please load document class `article' with `10pt' option} + \PackageWarningNoLine{cvpr}{Incorrect font size specified - CVPR requires 10-point fonts. 
Please load document class `article' with `10pt' option} } \@ifclasswith{article}{twocolumn}{}{% - \PackageWarningNoLine{iccv}{Single column document - ICCV requires papers to have two-column layout. Please load document class `article' with `twocolumn' option} + \PackageWarningNoLine{cvpr}{Single column document - CVPR requires papers to have two-column layout. Please load document class `article' with `twocolumn' option} } \@ifclasswith{article}{letterpaper}{}{% - \PackageWarningNoLine{iccv}{Incorrect paper size - ICCV uses paper size `letter'. Please load document class `article' with `letterpaper' option} + \PackageWarningNoLine{cvpr}{Incorrect paper size - CVPR uses paper size `letter'. Please load document class `article' with `letterpaper' option} } % Print a warning if hyperref is not loaded and/or if the pagebackref option is missing - \iftoggle{iccvfinal}{% + \iftoggle{cvprfinal}{% \@ifpackageloaded{hyperref}{}{% - \PackageWarningNoLine{iccv}{Package `hyperref' is not loaded, but highly recommended for camera-ready version} + \PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded, but highly recommended for camera-ready version} } }{% \@ifpackageloaded{hyperref}{ \@ifpackagewith{hyperref}{pagebackref}{}{ - \PackageWarningNoLine{iccv}{Package `hyperref' is not loaded with option `pagebackref', which is strongly recommended for review version} + \PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded with option `pagebackref', which is strongly recommended for review version} } }{% - \PackageWarningNoLine{iccv}{Package `hyperref' is not loaded, but strongly recommended for review version} + \PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded, but strongly recommended for review version} } } } @@ -279,19 +279,19 @@ \def\@maketitle{ \newpage \null - \iftoggle{iccvrebuttal}{\vspace*{-.3in}}{\vskip .375in} + \iftoggle{cvprrebuttal}{\vspace*{-.3in}}{\vskip .375in} \begin{center} % smaller title font only for rebuttal - 
\iftoggle{iccvrebuttal}{{\large \bf \@title \par}}{{\Large \bf \@title \par}} + \iftoggle{cvprrebuttal}{{\large \bf \@title \par}}{{\Large \bf \@title \par}} % additional two empty lines at the end of the title - \iftoggle{iccvrebuttal}{\vspace*{-22pt}}{\vspace*{24pt}}{ + \iftoggle{cvprrebuttal}{\vspace*{-22pt}}{\vspace*{24pt}}{ \large \lineskip .5em \begin{tabular}[t]{c} - \iftoggle{iccvfinal}{ + \iftoggle{cvprfinal}{ \@author }{ - \iftoggle{iccvrebuttal}{}{ + \iftoggle{cvprrebuttal}{}{ Anonymous \confName~submission\\ \vspace*{1pt}\\ Paper ID \paperID @@ -309,7 +309,7 @@ \def\abstract{% % Suppress page numbers when the appropriate option is given - \iftoggle{iccvpagenumbers}{}{% + \iftoggle{cvprpagenumbers}{}{% \thispagestyle{empty} } \centerline{\large\bf Abstract}% @@ -325,26 +325,26 @@ \def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} % correct heading spacing and type -\def\iccvsection{\@startsection {section}{1}{\z@} +\def\cvprsection{\@startsection {section}{1}{\z@} {-10pt plus -2pt minus -2pt}{7pt} {\large\bf}} -\def\iccvssect#1{\iccvsection*{#1}} -\def\iccvsect#1{\iccvsection{\texorpdfstring{\hskip -1em.~}{}#1}} -\def\section{\@ifstar\iccvssect\iccvsect} +\def\cvprssect#1{\cvprsection*{#1}} +\def\cvprsect#1{\cvprsection{\texorpdfstring{\hskip -1em.~}{}#1}} +\def\section{\@ifstar\cvprssect\cvprsect} -\def\iccvsubsection{\@startsection {subsection}{2}{\z@} +\def\cvprsubsection{\@startsection {subsection}{2}{\z@} {-8pt plus -2pt minus -2pt}{5pt} {\elvbf}} -\def\iccvssubsect#1{\iccvsubsection*{#1}} -\def\iccvsubsect#1{\iccvsubsection{\texorpdfstring{\hskip -1em.~}{}#1}} -\def\subsection{\@ifstar\iccvssubsect\iccvsubsect} +\def\cvprssubsect#1{\cvprsubsection*{#1}} +\def\cvprsubsect#1{\cvprsubsection{\texorpdfstring{\hskip -1em.~}{}#1}} +\def\subsection{\@ifstar\cvprssubsect\cvprsubsect} -\def\iccvsubsubsection{\@startsection {subsubsection}{3}{\z@} +\def\cvprsubsubsection{\@startsection {subsubsection}{3}{\z@} {-6pt plus -2pt minus -2pt}{3pt} 
{\tenbf}} -\def\iccvssubsubsect#1{\iccvsubsubsection*{#1}} -\def\iccvsubsubsect#1{\iccvsubsubsection{\texorpdfstring{\hskip -1em.~}{}#1}} -\def\subsubsection{\@ifstar\iccvssubsubsect\iccvsubsubsect} +\def\cvprssubsubsect#1{\cvprsubsubsection*{#1}} +\def\cvprsubsubsect#1{\cvprsubsubsection{\texorpdfstring{\hskip -1em.~}{}#1}} +\def\subsubsection{\@ifstar\cvprssubsubsect\cvprsubsubsect} %% --------- Page background marks: Ruler and confidentiality (only for review and rebuttal) -\iftoggle{iccvfinal}{ +\iftoggle{cvprfinal}{ % In review and rebuttal mode, we use the "lineno" package for numbering lines. % When switching to a different mode, the "\@LN" macro may remain in cached .aux files, % leading to build errors (https://github.com/cvpr-org/author-kit/issues/49). @@ -355,9 +355,9 @@ }{ % ----- define vruler \makeatletter - \newbox\iccvrulerbox - \newcount\iccvrulercount - \newdimen\iccvruleroffset + \newbox\cvprrulerbox + \newcount\cvprrulercount + \newdimen\cvprruleroffset \newdimen\cv@lineheight \newdimen\cv@boxheight \newbox\cv@tmpbox @@ -378,8 +378,8 @@ %% Define linenumber setup \RequirePackage[switch,mathlines]{lineno} - % Line numbers in ICCV blue using font from \iccvtenhv - \renewcommand\linenumberfont{\iccvtenhv\color[rgb]{.5,.5,1}} + % Line numbers in CVPR blue using font from \cvprtenhv + \renewcommand\linenumberfont{\cvprtenhv\color[rgb]{.5,.5,1}} \renewcommand\thelinenumber{\fillzeros[3]{\arabic{linenumber}}} @@ -426,7 +426,7 @@ } % \makevruler[][][][][] - \def\iccvruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\iccvrulerbox}} + \def\cvprruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\cvprrulerbox}} \AddToShipoutPicture{% \color[rgb]{.5,.5,1} @@ -436,11 +436,11 @@ \put(\LenToUnit{\textwidth-12pt},\LenToUnit{45pt}){\pid} } \AtTextUpperLeft{%confidential - \put(0,\LenToUnit{1cm}){\parbox{\textwidth}{\centering\iccvtenhv + \put(0,\LenToUnit{1cm}){\parbox{\textwidth}{\centering\cvprtenhv \confName~\confYear~Submission 
\#\paperID. CONFIDENTIAL REVIEW COPY. DO NOT DISTRIBUTE.}} } } -} % end of not iccvfinal +} % end of not cvprfinal %%% Make figure placement a little more predictable. % We trust the user to move figures if this results diff --git a/arxiv_v2_arXiv/ieeenat_fullname.bst b/arxiv_v2_arXiv/ieeenat_fullname.bst new file mode 100644 index 0000000..261b8c3 --- /dev/null +++ b/arxiv_v2_arXiv/ieeenat_fullname.bst @@ -0,0 +1,1448 @@ +%% File: `abbrvnat.bst' +%% A modification of `abbrv.bst' for use with natbib package +%% +%% Copyright 1993-2007 Patrick W Daly +%% Max-Planck-Institut f\"ur Sonnensystemforschung +%% Max-Planck-Str. 2 +%% D-37191 Katlenburg-Lindau +%% Germany +%% E-mail: daly@mps.mpg.de +%% +%% This program can be redistributed and/or modified under the terms +%% of the LaTeX Project Public License Distributed from CTAN +%% archives in directory macros/latex/base/lppl.txt; either +%% version 1 of the License, or any later version. +%% + % Version and source file information: + % \ProvidesFile{natbst.mbs}[2007/11/26 1.93 (PWD)] + % + % BibTeX `plainnat' family + % version 0.99b for BibTeX versions 0.99a or later, + % for LaTeX versions 2.09 and 2e. + % + % For use with the `natbib.sty' package; emulates the corresponding + % member of the `plain' family, but with author-year citations. + % + % With version 6.0 of `natbib.sty', it may also be used for numerical + % citations, while retaining the commands \citeauthor, \citefullauthor, + % and \citeyear to print the corresponding information. + % + % For version 7.0 of `natbib.sty', the KEY field replaces missing + % authors/editors, and the date is left blank in \bibitem. + % + % Includes field EID for the sequence/citation number of electronic journals + % which is used instead of page numbers. + % + % Includes fields ISBN and ISSN. + % + % Includes field URL for Internet addresses. + % + % Includes field DOI for Digital Object Idenfifiers. + % + % Works best with the url.sty package of Donald Arseneau. 
+ % + % Works with identical authors and year are further sorted by + % citation key, to preserve any natural sequence. + % +ENTRY + { address + author + booktitle + chapter + doi + eid + edition + editor + howpublished + institution + isbn + issn + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + url + volume + year + } + {} + { label extra.label sort.label short.list } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {new.sentence.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.sentence + if$ +} 
+ +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "\emph{" swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + % Formerly { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't := + { s nameptr "{ff }{vv }{ll}{, jj}" format.name$ 't := + nameptr #1 > + { namesleft #1 > + { ", " * t * } + { numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " et~al." * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.key} +{ empty$ + { key field.or.null } + { "" } + if$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.isbn} +{ isbn empty$ + { "" } +% { new.block "ISBN " isbn * } + { "" } + if$ +} + +FUNCTION {format.issn} +{ issn empty$ + { "" } +% { new.block "ISSN " issn * } + { "" } + if$ +} + +FUNCTION {format.url} +{ url empty$ + { "" } +% { new.block "URL \url{" url * "}" * } + { "" } + if$ +} + +FUNCTION {format.doi} +{ doi empty$ + { "" } +% { new.block "\doi{" doi * "}" * } + { "" } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { title "t" change.case$ } + if$ +} + +FUNCTION {format.full.names} +{'s := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}" format.name$ 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " et~al." 
* } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {author.editor.full} +{ author empty$ + { editor empty$ + { "" } + { editor format.full.names } + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {author.full} +{ author empty$ + { "" } + { author format.full.names } + if$ +} + +FUNCTION {editor.full} +{ editor empty$ + { "" } + { editor format.full.names } + if$ +} + +FUNCTION {make.full.names} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.full + { type$ "proceedings" = + 'editor.full + 'author.full + if$ + } + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem[" write$ + label write$ + ")" make.full.names duplicate$ short.list = + { pop$ } + { * } + if$ + "]{" * write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year duplicate$ empty$ + { "empty year in " cite$ * warning$ + pop$ "" } + 'skip$ + if$ +%% CR: Leave out months. +% month empty$ +% 'skip$ +% { month +% " " * swap$ * +% } +% if$ + extra.label * +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } +%% CR: Don't show "volume 1234 of LNCS" etc. 
+% { "volume" volume tie.or.space.connect +% series empty$ +% 'skip$ +% { " of " * series emphasize * } +% if$ +% "volume and number" number either.or.check +% } + { "" } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ +%% CR: Leave out series information. +% { series field.or.null } + { "" } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pages" pages n.dashify tie.or.space.connect } + { "page" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.eid} +{ eid empty$ + { "" } + { "art." 
eid tie.or.space.connect } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume field.or.null + number empty$ + 'skip$ + { "\penalty0 (" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ":\penalty0 " * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.vol.num.eid} +{ volume field.or.null + number empty$ + 'skip$ + { "\penalty0 (" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + eid empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.eid } + { ":\penalty0 " * eid * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } +%% CR: Leave out editors even if the information is available. 
+% { editor empty$ +% { "In " booktitle emphasize * } +% { "In " format.editors * ", " * booktitle emphasize * } +% if$ +% } + { "In " booktitle emphasize * } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + key empty$ not and + { "all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to crossref " * crossref * + warning$ + "" + } + { "In \emph{" journal * "}" * } + if$ + } + { "In " } + if$ + " \citet{" * crossref * "}" * +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "Volume" volume tie.or.space.connect + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "\emph{" * series * "}" * } + if$ + } + 'skip$ + if$ + } + 'skip$ + if$ + " \citet{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "In \emph{" booktitle * "}" * } + if$ + } + { "In " } + if$ + } + { "In " } + if$ + " \citet{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + author format.key output + new.block + format.title "title" output.check + new.block + crossref missing$ + { 
journal emphasize "journal" output.check + eid empty$ + { format.vol.num.pages output } + { format.vol.num.eid output } + if$ + format.date "year" output.check + } + { format.article.crossref output.nonnull + eid empty$ + { format.pages output } + { format.eid output } + if$ + } + if$ + format.issn output + format.doi output + format.url output + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + format.isbn output + format.doi output + format.url output + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + author format.key output + new.block + format.title "title" output.check + howpublished address new.block.checkb + howpublished output + address output + format.date output + format.isbn output + format.doi output + format.url output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and 
pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + format.isbn output + format.doi output + format.url output + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + author format.key output + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + format.date "year" output.check + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + format.isbn output + format.doi output + format.url output + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + author format.key output + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + address empty$ + { organization publisher new.sentence.checkb + organization output + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + organization output + publisher output + } + if$ + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + format.isbn output + format.doi output + format.url output + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + format.authors output + author format.key output + new.block + format.btitle "title" output.check + organization address new.block.checkb + organization output + address output + format.edition output + format.date output + format.url output + 
new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + new.block + format.title "title" output.check + new.block + "Master's thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + format.url output + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + author format.key output + title howpublished new.block.checkb + format.title output + howpublished new.block.checka + howpublished output + format.date output + format.issn output + format.url output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + new.block + format.btitle "title" output.check + new.block + "PhD thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + format.url output + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + format.editors output + editor format.key output + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address output + format.date "year" output.check + new.sentence + organization output + publisher output + format.isbn output + format.doi output + format.url output + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + author format.key output + new.block + format.title "title" output.check + new.block + format.tr.number output.nonnull + institution "institution" output.check + address output + format.date "year" output.check + format.url output + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + author format.key output + new.block + format.title 
"title" output.check + new.block + note "note" output.check + format.date output + format.url output + fin.entry +} + +FUNCTION {default.type} { misc } + + +MACRO {jan} {"Jan."} + +MACRO {feb} {"Feb."} + +MACRO {mar} {"Mar."} + +MACRO {apr} {"Apr."} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"Aug."} + +MACRO {sep} {"Sept."} + +MACRO {oct} {"Oct."} + +MACRO {nov} {"Nov."} + +MACRO {dec} {"Dec."} + + + +MACRO {acmcs} {"ACM Comput. Surv."} + +MACRO {acta} {"Acta Inf."} + +MACRO {cacm} {"Commun. ACM"} + +MACRO {ibmjrd} {"IBM J. Res. Dev."} + +MACRO {ibmsj} {"IBM Syst.~J."} + +MACRO {ieeese} {"IEEE Trans. Softw. Eng."} + +MACRO {ieeetc} {"IEEE Trans. Comput."} + +MACRO {ieeetcad} + {"IEEE Trans. Comput.-Aided Design Integrated Circuits"} + +MACRO {ipl} {"Inf. Process. Lett."} + +MACRO {jacm} {"J.~ACM"} + +MACRO {jcss} {"J.~Comput. Syst. Sci."} + +MACRO {scp} {"Sci. Comput. Programming"} + +MACRO {sicomp} {"SIAM J. Comput."} + +MACRO {tocs} {"ACM Trans. Comput. Syst."} + +MACRO {tods} {"ACM Trans. Database Syst."} + +MACRO {tog} {"ACM Trans. Gr."} + +MACRO {toms} {"ACM Trans. Math. Softw."} + +MACRO {toois} {"ACM Trans. Office Inf. Syst."} + +MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."} + +MACRO {tcs} {"Theoretical Comput. Sci."} + + +READ + +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +FUNCTION {format.lab.names} +{ 's := + s #1 "{vv~}{ll}" format.name$ + s num.names$ duplicate$ + #2 > + { pop$ " et~al." * } + { #2 < + 'skip$ + { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " et~al." 
* } + { " and " * s #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {author.key.label} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.editor.key.label} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.lab.names } + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.key.organization.label} +{ author empty$ + { key empty$ + { organization empty$ + { cite$ #1 #3 substring$ } + { "The " #4 organization chop.word #3 text.prefix$ } + if$ + } + 'key + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {editor.key.organization.label} +{ editor empty$ + { key empty$ + { organization empty$ + { cite$ #1 #3 substring$ } + { "The " #4 organization chop.word #3 text.prefix$ } + if$ + } + 'key + if$ + } + { editor format.lab.names } + if$ +} + +FUNCTION {calc.short.authors} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.label + { type$ "proceedings" = + 'editor.key.organization.label + { type$ "manual" = + 'author.key.organization.label + 'author.key.label + if$ + } + if$ + } + if$ + 'short.list := +} + +FUNCTION {calc.label} +{ calc.short.authors + short.list + "(" + * + year duplicate$ empty$ + short.list key field.or.null = or + { pop$ "" } + 'skip$ + if$ + * + 'label := +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { + s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't := + nameptr #1 > + { + " " * + namesleft #1 = t "others" = and + { "zzzzz" * } + { numnames #2 > nameptr #2 = and + { "zz" * year field.or.null * " " * } + 'skip$ + if$ + t sortify * + } + if$ + } + { t sortify * } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + 
chop.word + sortify + #1 global.max$ substring$ +} + +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.organization.sort} +{ author empty$ + { organization empty$ + { key empty$ + { "to sort, need author, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {editor.organization.sort} +{ editor empty$ + { organization empty$ + { key empty$ + { "to sort, need editor, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { editor sort.format.names } + if$ +} + + +FUNCTION {presort} +{ calc.label + label sortify + " " + * + type$ "book" = + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.organization.sort + { type$ "manual" = + 'author.organization.sort + 'author.sort + if$ + } + if$ + } + if$ + " " + * + year field.or.null sortify + * + " " + * + cite$ + * + #1 entry.max$ substring$ + 'sort.label := + sort.label * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} + +SORT + +STRINGS { longest.label last.label next.extra } + +INTEGERS { longest.label.width last.extra.num number.label } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #0 int.to.chr$ 'last.label := + "" 'next.extra := + #0 'longest.label.width := + #0 'last.extra.num := + #0 'number.label := +} + +FUNCTION {forward.pass} +{ last.label label = + { last.extra.num #1 + 'last.extra.num := + last.extra.num 
int.to.chr$ 'extra.label := + } + { "a" chr.to.int$ 'last.extra.num := + "" 'extra.label := + label 'last.label := + } + if$ + number.label #1 + 'number.label := +} + +FUNCTION {reverse.pass} +{ next.extra "b" = + { "a" 'extra.label := } + 'skip$ + if$ + extra.label 'next.extra := + extra.label + duplicate$ empty$ + 'skip$ + { "{\natexlab{" swap$ * "}}" * } + if$ + 'extra.label := + label extra.label * 'label := +} + +EXECUTE {initialize.longest.label} + +ITERATE {forward.pass} + +REVERSE {reverse.pass} + +FUNCTION {bib.sort.order} +{ sort.label 'sort.key$ := +} + +ITERATE {bib.sort.order} + +SORT + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" number.label int.to.str$ * "}" * + write$ newline$ + "\providecommand{\natexlab}[1]{#1}" + write$ newline$ + "\providecommand{\url}[1]{\texttt{#1}}" + write$ newline$ + "\expandafter\ifx\csname urlstyle\endcsname\relax" + write$ newline$ + " \providecommand{\doi}[1]{doi: #1}\else" + write$ newline$ + " \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi" + write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} diff --git a/arxiv_v2_arXiv/img/DeiT-B_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/DeiT-B_ImageNet_v2.pdf new file mode 100644 index 0000000..ac76a2f Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-B_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-B_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/DeiT-B_ImageNet_v3.pdf new file mode 100644 index 0000000..da18456 --- /dev/null +++ b/arxiv_v2_arXiv/img/DeiT-B_ImageNet_v3.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 
R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMA >Oc%4GN2 b"rR=>}{<~:'0> 72 +pHf +peŗ#~HKD)"ִ -a*Hha/gD_0=UOKձz hbQ8Sn¥YUN9+K"7P#yUIa~[;@C27R-̫paiFpfR*QN9f YϒQpS>lfngH5jEZ-ZRG;J)'vζIfd367\Zem +{5IXl(a\o~ǎ3Qʹ9bwqE#VN9m؁،<__5'58G +endstream +endobj +12 0 obj +420 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073347+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000970 00000 n +0000000865 00000 n +0000000886 00000 n +0000000907 00000 n +0000000928 00000 n +0000000949 00000 n +0000000065 00000 n +0000000330 00000 n +0000000845 00000 n +0000000208 00000 n +0000000825 00000 n +0000001030 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1187 +%%EOF diff --git a/arxiv_v2_arXiv/img/DeiT-B_ImageNet_vNone.pdf b/arxiv_v2_arXiv/img/DeiT-B_ImageNet_vNone.pdf new file mode 100644 index 0000000..28816a9 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-B_ImageNet_vNone.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v1.pdf b/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v1.pdf new file mode 100644 index 0000000..0abb5b6 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v1.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v2.pdf b/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v2.pdf new file mode 100644 index 0000000..06cec15 --- /dev/null +++ b/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v2.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI 
] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xn0 E +~CJ|nn3q@di^YZ^ç/+<0 @H sx;~b͇{/HKZ2"-ƚliHkdp|="wp%MID4J %E!F +Q#-jL UrC,s+9 CRZ!V](ecQr^ENTH;w"HKM j&n[w˨iۣ$\6V +.ځ}<_/3'n{Qpx^Uªk2µp@sgIN]%;i[gm4{ ROlzXi5~[ 7ʁ]/nؓil8:6Y,~C׺]-?jN>18g #9 +endstream +endobj +12 0 obj +429 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073409+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000979 00000 n +0000000874 00000 n +0000000895 00000 n +0000000916 00000 n +0000000937 00000 n +0000000958 00000 n +0000000065 00000 n +0000000330 00000 n +0000000854 00000 n +0000000208 00000 n +0000000834 00000 n +0000001039 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1196 +%%EOF diff --git a/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v3.pdf b/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v3.pdf new file mode 100644 index 0000000..ddf3280 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-B_fornet_all_cos_v3.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v1.pdf b/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v1.pdf new file mode 100644 index 0000000..8785a89 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v1.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v2.pdf new file mode 100644 index 0000000..d20839e Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v3.pdf new file mode 100644 index 0000000..8a83e3f 
--- /dev/null +++ b/arxiv_v2_arXiv/img/DeiT-L_ImageNet_v3.pdf @@ -0,0 +1,73 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xn@ E +~Br8|l^];mh +_C-Eq5ç/+<30>!3A2>! +/}m \'6iI,U)+RUH -irG_4% n h L)ܼhambF*a:$m 7Pggw6ޭP2sҢ"b> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073347+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000991 00000 n +0000000886 00000 n +0000000907 00000 n +0000000928 00000 n +0000000949 00000 n +0000000970 00000 n +0000000065 00000 n +0000000330 00000 n +0000000866 00000 n +0000000208 00000 n +0000000846 00000 n +0000001051 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1208 +%%EOF diff --git a/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v1.pdf b/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v1.pdf new file mode 100644 index 0000000..26efb4b Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v1.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v2.pdf b/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v2.pdf new file mode 100644 index 0000000..52779b7 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v3.pdf b/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v3.pdf new file mode 100644 index 0000000..3bbace3 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-L_fornet_all_cos_v3.pdf differ diff --git 
a/arxiv_v2_arXiv/img/DeiT-S_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/DeiT-S_ImageNet_v2.pdf new file mode 100644 index 0000000..d1e7762 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-S_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-S_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/DeiT-S_ImageNet_v3.pdf new file mode 100644 index 0000000..aee0d98 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-S_ImageNet_v3.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-S_ImageNet_vNone.pdf b/arxiv_v2_arXiv/img/DeiT-S_ImageNet_vNone.pdf new file mode 100644 index 0000000..ea24781 Binary files /dev/null and b/arxiv_v2_arXiv/img/DeiT-S_ImageNet_vNone.pdf differ diff --git a/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v1.pdf b/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v1.pdf new file mode 100644 index 0000000..17d723d --- /dev/null +++ b/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v1.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xAn1 E<O0!%J$M.rvx$$8i7] 8}y˧ǟG$|BȸC#"HxڮШM6_WR,"-{EZrJe|="qQC8rR/fᄕ2;3RRWW eG ᩕ@NPg{ʦErn8fk Nd`VF١ob +zJ8&fIJ .\X2fnϐ^ϣFm#5ɜeI\դRqz&ΟxeN+!-E\HRrg=;J(vtxtNkZ5cM5RxL6:h_سil.9IrRcB.Eޞ6^/m͜Lsp/K +endstream +endobj +12 0 obj +416 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073412+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000966 00000 n +0000000861 
00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000065 00000 n +0000000330 00000 n +0000000841 00000 n +0000000208 00000 n +0000000821 00000 n +0000001026 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1183 +%%EOF diff --git a/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v2.pdf b/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v2.pdf new file mode 100644 index 0000000..f0b227f --- /dev/null +++ b/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v2.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xM0 <O!%"m.zAv)0sB'ARta?h?}ٿz}iz/xD$@2pASvF pd+UKRR"-9e|;"qQM8ŒzaIRqGZTQmGBv +Z ;Tr{B캅fN58W +endstream +endobj +12 0 obj +433 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073412+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000983 00000 n +0000000878 00000 n +0000000899 00000 n +0000000920 00000 n +0000000941 00000 n +0000000962 00000 n +0000000065 00000 n +0000000330 00000 n +0000000858 00000 n +0000000208 00000 n +0000000838 00000 n +0000001043 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1200 +%%EOF diff --git a/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v3.pdf b/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v3.pdf new file mode 100644 index 0000000..be979ae --- /dev/null +++ 
b/arxiv_v2_arXiv/img/DeiT-S_fornet_all_linear_v3.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xnA E +.ҬMQ:AG&Eq^>~?}9~_ru  2 7`U$|k;~b'/Ht+IHj.\5R\#rgߘ4) nC+Q" 7VT s!(J)rI`,UYMآEIeo+aзbtsEj5U%iQN9]GMݞK؍A*,jV\k,]gn{Z]#u&ꄪ=(av26fήlR S(gVN9ڹ]}vz%)ǚ{j+ۤVN9u؁=Xҙ)fַ8QKrK/?e3 +endstream +endobj +12 0 obj +437 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073413+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000987 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000966 00000 n +0000000065 00000 n +0000000330 00000 n +0000000862 00000 n +0000000208 00000 n +0000000842 00000 n +0000001047 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1204 +%%EOF diff --git a/arxiv_v2_arXiv/img/ResNet101_ImageNet_v1.pdf b/arxiv_v2_arXiv/img/ResNet101_ImageNet_v1.pdf new file mode 100644 index 0000000..0000c01 Binary files /dev/null and b/arxiv_v2_arXiv/img/ResNet101_ImageNet_v1.pdf differ diff --git a/arxiv_v2_arXiv/img/ResNet101_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/ResNet101_ImageNet_v2.pdf new file mode 100644 index 0000000..06c7992 Binary files /dev/null and b/arxiv_v2_arXiv/img/ResNet101_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/ResNet101_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/ResNet101_ImageNet_v3.pdf new file mode 
100644 index 0000000..82752dd --- /dev/null +++ b/arxiv_v2_arXiv/img/ResNet101_ImageNet_v3.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xKn@ <OC۶m=@` m<;]t!pI?~0> xBX {l3ЌN l}oiIQ sEZ5<"#-53oLWݔjtK3E8-Q)$KinKդ64=!BZG Ɵv:9݊d7\ҹjTS⁾r@wS7ԠjH.`Ek iDqJ)GmTzg諍(yU +=_so ߓtM4h L)Q(,%^jٌ克1՞faeCZ8(yVvQ:,豄IdO8]CzR$.M!^:o.6[oLڐZ]%U}ŭFuo+aRfNg +endstream +endobj +12 0 obj +437 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150045+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000987 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000966 00000 n +0000000065 00000 n +0000000330 00000 n +0000000862 00000 n +0000000208 00000 n +0000000842 00000 n +0000001047 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1206 +%%EOF diff --git a/img/ResNet101_RecombNetAll_v1.pdf b/arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v1.pdf similarity index 90% rename from img/ResNet101_RecombNetAll_v1.pdf rename to arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v1.pdf index 53ebc28..d2c4d1b 100644 Binary files a/img/ResNet101_RecombNetAll_v1.pdf and b/arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v1.pdf differ diff --git a/img/ResNet101_RecombNetAll_v2.pdf b/arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v2.pdf similarity index 90% rename from 
img/ResNet101_RecombNetAll_v2.pdf rename to arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v2.pdf index 08e4c7d..74734f4 100644 Binary files a/img/ResNet101_RecombNetAll_v2.pdf and b/arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v2.pdf differ diff --git a/img/ResNet101_RecombNetAll_v3.pdf b/arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v3.pdf similarity index 90% rename from img/ResNet101_RecombNetAll_v3.pdf rename to arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v3.pdf index 8555faa..d97cbcc 100644 --- a/img/ResNet101_RecombNetAll_v3.pdf +++ b/arxiv_v2_arXiv/img/ResNet101_RecombNet_all_v3.pdf @@ -43,9 +43,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094747+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150051+02'00') >> endobj xref 0 14 @@ -66,5 +66,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1208 +1210 %%EOF diff --git a/arxiv_v2_arXiv/img/ResNet50_ImageNet_v1.pdf b/arxiv_v2_arXiv/img/ResNet50_ImageNet_v1.pdf new file mode 100644 index 0000000..32bc24f --- /dev/null +++ b/arxiv_v2_arXiv/img/ResNet50_ImageNet_v1.pdf @@ -0,0 +1,72 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xKn@D}>=m@$8@r`>dB ⼪j<>ǯpzOxF'$d<! +,}m3\'6iq +jEi!MR)RLg<1]ItS(-݄ͥВJ Ғi`uekUN9+uH@nΎ4} -Jj;J),:us1MY*"3*q TE}&rFMmϥmlEC*^uas6SmQ18i+. 
rTWN9vX7`V$ZZ,Ys}!섣6ۏmM-+[ $izu*R +eSNzu`/>6c}\O-<'dNVb_ ;ү3M +endstream +endobj +12 0 obj +443 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150046+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000993 00000 n +0000000888 00000 n +0000000909 00000 n +0000000930 00000 n +0000000951 00000 n +0000000972 00000 n +0000000065 00000 n +0000000330 00000 n +0000000868 00000 n +0000000208 00000 n +0000000848 00000 n +0000001053 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1212 +%%EOF diff --git a/arxiv_v2_arXiv/img/ResNet50_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/ResNet50_ImageNet_v2.pdf new file mode 100644 index 0000000..06ee1b4 Binary files /dev/null and b/arxiv_v2_arXiv/img/ResNet50_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/ResNet50_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/ResNet50_ImageNet_v3.pdf new file mode 100644 index 0000000..e020acf Binary files /dev/null and b/arxiv_v2_arXiv/img/ResNet50_ImageNet_v3.pdf differ diff --git a/img/ResNet50_RecombNetAll_v1.pdf b/arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v1.pdf similarity index 90% rename from img/ResNet50_RecombNetAll_v1.pdf rename to arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v1.pdf index b2e75c2..88ba2e9 100644 --- a/img/ResNet50_RecombNetAll_v1.pdf +++ b/arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v1.pdf @@ -42,9 +42,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094746+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate 
(D:20250724150050+02'00') >> endobj xref 0 14 @@ -65,5 +65,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1204 +1206 %%EOF diff --git a/img/ResNet50_RecombNetAll_v2.pdf b/arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v2.pdf similarity index 90% rename from img/ResNet50_RecombNetAll_v2.pdf rename to arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v2.pdf index 0f8cc87..d9c8523 100644 Binary files a/img/ResNet50_RecombNetAll_v2.pdf and b/arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v2.pdf differ diff --git a/img/ResNet50_RecombNetAll_v3.pdf b/arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v3.pdf similarity index 90% rename from img/ResNet50_RecombNetAll_v3.pdf rename to arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v3.pdf index efa84ce..2259597 100644 --- a/img/ResNet50_RecombNetAll_v3.pdf +++ b/arxiv_v2_arXiv/img/ResNet50_RecombNet_all_v3.pdf @@ -42,9 +42,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094746+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150050+02'00') >> endobj xref 0 14 @@ -65,5 +65,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1197 +1199 %%EOF diff --git a/arxiv_v2_arXiv/img/Swin-S_ImageNet_v1.pdf b/arxiv_v2_arXiv/img/Swin-S_ImageNet_v1.pdf new file mode 100644 index 0000000..ef09926 --- /dev/null +++ b/arxiv_v2_arXiv/img/Swin-S_ImageNet_v1.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xKn@D}>=ӟm@$8@r`>$B + !>x<~9|_|  
2  W`U$|c;^?ɶoGHKr)Ő1$pZ=!w{tMiF4vN@K$GKMU$vaV)BZG Ɵv:9=\iVG*VHv^N\LS\DY M(9pJ)GeԴFZ> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150047+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000988 00000 n +0000000883 00000 n +0000000904 00000 n +0000000925 00000 n +0000000946 00000 n +0000000967 00000 n +0000000065 00000 n +0000000330 00000 n +0000000863 00000 n +0000000208 00000 n +0000000843 00000 n +0000001048 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1207 +%%EOF diff --git a/arxiv_v2_arXiv/img/Swin-S_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/Swin-S_ImageNet_v2.pdf new file mode 100644 index 0000000..cde8932 Binary files /dev/null and b/arxiv_v2_arXiv/img/Swin-S_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/Swin-S_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/Swin-S_ImageNet_v3.pdf new file mode 100644 index 0000000..0d223e5 Binary files /dev/null and b/arxiv_v2_arXiv/img/Swin-S_ImageNet_v3.pdf differ diff --git a/img/Swin-S_RecombNetAll_v1.pdf b/arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v1.pdf similarity index 90% rename from img/Swin-S_RecombNetAll_v1.pdf rename to arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v1.pdf index 99f0442..c3cfae6 100644 Binary files a/img/Swin-S_RecombNetAll_v1.pdf and b/arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v1.pdf differ diff --git a/img/Swin-S_RecombNetAll_v2.pdf b/arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v2.pdf similarity index 90% rename from img/Swin-S_RecombNetAll_v2.pdf rename to arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v2.pdf index 5267a01..2be743b 100644 --- a/img/Swin-S_RecombNetAll_v2.pdf +++ b/arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v2.pdf @@ -42,9 +42,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< 
/Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094745+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150049+02'00') >> endobj xref 0 14 @@ -65,5 +65,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1168 +1170 %%EOF diff --git a/img/Swin-S_RecombNetAll_v3.pdf b/arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v3.pdf similarity index 90% rename from img/Swin-S_RecombNetAll_v3.pdf rename to arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v3.pdf index 2db583f..49c3b06 100644 Binary files a/img/Swin-S_RecombNetAll_v3.pdf and b/arxiv_v2_arXiv/img/Swin-S_RecombNet_all_v3.pdf differ diff --git a/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v1.pdf b/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v1.pdf new file mode 100644 index 0000000..171804c Binary files /dev/null and b/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v1.pdf differ diff --git a/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v2.pdf new file mode 100644 index 0000000..7e56d1c --- /dev/null +++ b/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v2.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn0 <O"şm[`m=@0641StaȖ=>~t?~/`|'$|F?x@' $\Uzw&oGHKYR"B!*H YQZ 3o WݔjtKchpjI-nHK +iDqqe3pW ;\!C6t0 uvf";qu@Z$JzwSngѩ05NnmTNe^ֳCmdEC[=y\ۯ'n{nT#J++HBE(alidۺ+#jkV1 +>u[ ;z]mĽIrKŖL+i.~[ ;nسil6eŪs?frK4'  +endstream +endobj +12 0 obj +440 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< 
/Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150045+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000990 00000 n +0000000885 00000 n +0000000906 00000 n +0000000927 00000 n +0000000948 00000 n +0000000969 00000 n +0000000065 00000 n +0000000330 00000 n +0000000865 00000 n +0000000208 00000 n +0000000845 00000 n +0000001050 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1209 +%%EOF diff --git a/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v3.pdf new file mode 100644 index 0000000..f6c1ac1 --- /dev/null +++ b/arxiv_v2_arXiv/img/Swin-Ti_ImageNet_v3.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xInA E<Of86 uE`HJ 8}&K-,~}×'?  0p$4O*/GHKhp(Ғ-D=(S,^匩1o W/5H4K#ԇpJpZBb`M?V؁+kb8N#P GZT]J-ٙ)w::p;FN.Af,5kJ6)A*6JSek}=lECrʞNja ~埸mIXT9' "ddpiȅ !*"d}OÄcw_[]oz[G6Z8%65lt+z_[ီev軋ik,v! 
+\ EWΦa$swp̻iNaY +endstream +endobj +12 0 obj +437 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150045+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000987 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000966 00000 n +0000000065 00000 n +0000000330 00000 n +0000000862 00000 n +0000000208 00000 n +0000000842 00000 n +0000001047 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1206 +%%EOF diff --git a/img/Swin-Ti_RecombNetAll_v1.pdf b/arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v1.pdf similarity index 90% rename from img/Swin-Ti_RecombNetAll_v1.pdf rename to arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v1.pdf index 9edf8f9..3c43d4b 100644 Binary files a/img/Swin-Ti_RecombNetAll_v1.pdf and b/arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v1.pdf differ diff --git a/img/Swin-Ti_RecombNetAll_v2.pdf b/arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v2.pdf similarity index 90% rename from img/Swin-Ti_RecombNetAll_v2.pdf rename to arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v2.pdf index 5144150..757529b 100644 Binary files a/img/Swin-Ti_RecombNetAll_v2.pdf and b/arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v2.pdf differ diff --git a/img/Swin-Ti_RecombNetAll_v3.pdf b/arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v3.pdf similarity index 90% rename from img/Swin-Ti_RecombNetAll_v3.pdf rename to arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v3.pdf index c0ee44c..6f7cc11 100644 Binary files a/img/Swin-Ti_RecombNetAll_v3.pdf and b/arxiv_v2_arXiv/img/Swin-Ti_RecombNet_all_v3.pdf differ diff --git a/arxiv_v2_arXiv/img/ViT-B_ImageNet_v1.pdf b/arxiv_v2_arXiv/img/ViT-B_ImageNet_v1.pdf new file mode 100644 index 
0000000..8319743 --- /dev/null +++ b/arxiv_v2_arXiv/img/ViT-B_ImageNet_v1.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xM1u:AO][@XpQM z2E߫Wv?|:xre OHxBX ~@spd N!iahe 3rgߘ$)  hmTOk#&7W\!#pO;l^VkޫT*,R3ɽ@r@o;ѩijɃIJ2IZGD0N8] MIϡ?mdEC*WXV +.y\ۯ7܆aM$ +2{{_ ;Įd۟ތlƺWqJ/'*2N*{lN8\h3_mBT*5U rbRfl/] f mv_ ;/7/98g O[ +endstream +endobj +12 0 obj +440 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150046+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000990 00000 n +0000000885 00000 n +0000000906 00000 n +0000000927 00000 n +0000000948 00000 n +0000000969 00000 n +0000000065 00000 n +0000000330 00000 n +0000000865 00000 n +0000000208 00000 n +0000000845 00000 n +0000001050 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1209 +%%EOF diff --git a/arxiv_v2_arXiv/img/ViT-B_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/ViT-B_ImageNet_v2.pdf new file mode 100644 index 0000000..d3cc4d6 Binary files /dev/null and b/arxiv_v2_arXiv/img/ViT-B_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/ViT-B_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/ViT-B_ImageNet_v3.pdf new file mode 100644 index 0000000..0844562 --- /dev/null +++ b/arxiv_v2_arXiv/img/ViT-B_ImageNet_v3.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 
R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn1 <O0!E"i.rvd4]daHޓׇ-~7oHoȸC#Hyo*/{gɳd.H L\S`#rC/ O͒:7CcPwd4#-[.S{)U eG a9 Cif1EDL)Ғ8̵Yg_QF١OQa#Jdoyg©ZXHDG+J({y԰FZR܀]ۯ7x[NZ¤k KNE8yQzNxƺ]kB|z3ǟ([j$))kPg+}QီA;Ŵ/h6QqʚC%*'"]VF9_ͻi +endstream +endobj +12 0 obj +424 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150045+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000974 00000 n +0000000869 00000 n +0000000890 00000 n +0000000911 00000 n +0000000932 00000 n +0000000953 00000 n +0000000065 00000 n +0000000330 00000 n +0000000849 00000 n +0000000208 00000 n +0000000829 00000 n +0000001034 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1193 +%%EOF diff --git a/img/ViT-B_RecombNetAll_v1.pdf b/arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v1.pdf similarity index 90% rename from img/ViT-B_RecombNetAll_v1.pdf rename to arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v1.pdf index b589def..2a70916 100644 --- a/img/ViT-B_RecombNetAll_v1.pdf +++ b/arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v1.pdf @@ -43,9 +43,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094745+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150049+02'00') >> endobj xref 0 14 @@ -66,5 +66,5 @@ xref 
trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1204 +1206 %%EOF diff --git a/img/ViT-B_RecombNetAll_v2.pdf b/arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v2.pdf similarity index 90% rename from img/ViT-B_RecombNetAll_v2.pdf rename to arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v2.pdf index 43bad24..93fc253 100644 Binary files a/img/ViT-B_RecombNetAll_v2.pdf and b/arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v2.pdf differ diff --git a/img/ViT-B_RecombNetAll_v3.pdf b/arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v3.pdf similarity index 90% rename from img/ViT-B_RecombNetAll_v3.pdf rename to arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v3.pdf index ed87bf4..e2bba06 100644 Binary files a/img/ViT-B_RecombNetAll_v3.pdf and b/arxiv_v2_arXiv/img/ViT-B_RecombNet_all_v3.pdf differ diff --git a/arxiv_v2_arXiv/img/ViT-L_ImageNet_v1.pdf b/arxiv_v2_arXiv/img/ViT-L_ImageNet_v1.pdf new file mode 100644 index 0000000..5c4d8b8 Binary files /dev/null and b/arxiv_v2_arXiv/img/ViT-L_ImageNet_v1.pdf differ diff --git a/arxiv_v2_arXiv/img/ViT-L_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/ViT-L_ImageNet_v2.pdf new file mode 100644 index 0000000..b4b082e Binary files /dev/null and b/arxiv_v2_arXiv/img/ViT-L_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/ViT-L_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/ViT-L_ImageNet_v3.pdf new file mode 100644 index 0000000..bf94eea --- /dev/null +++ b/arxiv_v2_arXiv/img/ViT-L_ImageNet_v3.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMnAu:S]][@XpDH v,ذ繿^s wO?#|@G$X 7("Hv=+c/H,,Z TE i\3ɝ_tӕD7%\&qjɩ2 ,57p :$m ihJTv%(6ٷp׳bt2☘lNJՔۜ:2hZz.}vBRK0vyvp #FPLwmJf4o)J9۹v26:&KjC,#RpZʠ6]A_fW1cXMYnYCj>fNN58G  +endstream 
+endobj +12 0 obj +415 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150047+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000965 00000 n +0000000860 00000 n +0000000881 00000 n +0000000902 00000 n +0000000923 00000 n +0000000944 00000 n +0000000065 00000 n +0000000330 00000 n +0000000840 00000 n +0000000208 00000 n +0000000820 00000 n +0000001025 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1184 +%%EOF diff --git a/img/ViT-L_RecombNetAll_v1.pdf b/arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v1.pdf similarity index 90% rename from img/ViT-L_RecombNetAll_v1.pdf rename to arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v1.pdf index 824df1c..8f46491 100644 Binary files a/img/ViT-L_RecombNetAll_v1.pdf and b/arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v1.pdf differ diff --git a/img/ViT-L_RecombNetAll_v2.pdf b/arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v2.pdf similarity index 90% rename from img/ViT-L_RecombNetAll_v2.pdf rename to arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v2.pdf index f9fd7f8..a06ee6d 100644 --- a/img/ViT-L_RecombNetAll_v2.pdf +++ b/arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v2.pdf @@ -43,9 +43,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094744+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150048+02'00') >> endobj xref 0 14 @@ -66,5 +66,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1188 +1190 %%EOF diff --git a/img/ViT-L_RecombNetAll_v3.pdf 
b/arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v3.pdf similarity index 90% rename from img/ViT-L_RecombNetAll_v3.pdf rename to arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v3.pdf index 898c369..c7a96d9 100644 Binary files a/img/ViT-L_RecombNetAll_v3.pdf and b/arxiv_v2_arXiv/img/ViT-L_RecombNet_all_v3.pdf differ diff --git a/arxiv_v2_arXiv/img/ViT-S_ImageNet_v1.pdf b/arxiv_v2_arXiv/img/ViT-S_ImageNet_v1.pdf new file mode 100644 index 0000000..af1acb7 Binary files /dev/null and b/arxiv_v2_arXiv/img/ViT-S_ImageNet_v1.pdf differ diff --git a/arxiv_v2_arXiv/img/ViT-S_ImageNet_v2.pdf b/arxiv_v2_arXiv/img/ViT-S_ImageNet_v2.pdf new file mode 100644 index 0000000..25f10a4 Binary files /dev/null and b/arxiv_v2_arXiv/img/ViT-S_ImageNet_v2.pdf differ diff --git a/arxiv_v2_arXiv/img/ViT-S_ImageNet_v3.pdf b/arxiv_v2_arXiv/img/ViT-S_ImageNet_v3.pdf new file mode 100644 index 0000000..6773bd9 --- /dev/null +++ b/arxiv_v2_arXiv/img/ViT-S_ImageNet_v3.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn0s92C-uEn(@S~Ŏ`]'~ Gzt?~/`|3>!d<  W`$|c?ɶ#~ŲrJEZH$ -,Rj3rG_$% &($mnHSr{UN9+uH@nΚThR#X*jԪ}G ;倾?N\LSaYw.Ll~[ ;z5W{}!\V\F.L} rتY/ hˡE_< +endstream +endobj +12 0 obj +443 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150046+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000993 00000 n +0000000888 00000 n 
+0000000909 00000 n +0000000930 00000 n +0000000951 00000 n +0000000972 00000 n +0000000065 00000 n +0000000330 00000 n +0000000868 00000 n +0000000208 00000 n +0000000848 00000 n +0000001053 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1212 +%%EOF diff --git a/img/ViT-S_RecombNetAll_v1.pdf b/arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v1.pdf similarity index 90% rename from img/ViT-S_RecombNetAll_v1.pdf rename to arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v1.pdf index b2b176d..9d82eb3 100644 Binary files a/img/ViT-S_RecombNetAll_v1.pdf and b/arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v1.pdf differ diff --git a/img/ViT-S_RecombNetAll_v2.pdf b/arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v2.pdf similarity index 90% rename from img/ViT-S_RecombNetAll_v2.pdf rename to arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v2.pdf index 4ceb446..56a7427 100644 Binary files a/img/ViT-S_RecombNetAll_v2.pdf and b/arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v2.pdf differ diff --git a/img/ViT-S_RecombNetAll_v3.pdf b/arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v3.pdf similarity index 90% rename from img/ViT-S_RecombNetAll_v3.pdf rename to arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v3.pdf index b713bf2..1581fd3 100644 --- a/img/ViT-S_RecombNetAll_v3.pdf +++ b/arxiv_v2_arXiv/img/ViT-S_RecombNet_all_v3.pdf @@ -44,9 +44,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094746+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150051+02'00') >> endobj xref 0 14 @@ -67,5 +67,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1208 +1210 %%EOF diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000090.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000090.JPEG new file mode 100644 index 0000000..d78189d Binary files 
/dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000090.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000547.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000547.JPEG new file mode 100644 index 0000000..5924788 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000547.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000890.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000890.JPEG new file mode 100644 index 0000000..e8a988c Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00000890.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00002106.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00002106.JPEG new file mode 100644 index 0000000..6865802 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00002106.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00002743.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00002743.JPEG new file mode 100644 index 0000000..7c6d06f Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00002743.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00003097.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00003097.JPEG new file mode 100644 index 0000000..b02df26 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00003097.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00005045.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00005045.JPEG new file mode 100644 index 0000000..9fe9aaf Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00005045.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00007437.JPEG 
b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00007437.JPEG new file mode 100644 index 0000000..3ed2656 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00007437.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00008542.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00008542.JPEG new file mode 100644 index 0000000..20a73e5 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00008542.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00009674.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00009674.JPEG new file mode 100644 index 0000000..d9ec2b7 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00009674.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00011629.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00011629.JPEG new file mode 100644 index 0000000..b38cc44 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00011629.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00025256.JPEG b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00025256.JPEG new file mode 100644 index 0000000..a63996c Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/comp/ILSVRC2012_val_00025256.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG b/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG new file mode 100644 index 0000000..fb7e134 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG b/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG new file mode 100644 index 0000000..199c627 Binary files /dev/null and 
b/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG b/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG new file mode 100644 index 0000000..9ec6f69 Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG differ diff --git a/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG b/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG new file mode 100644 index 0000000..be44e0c Binary files /dev/null and b/arxiv_v2_arXiv/img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG differ diff --git a/arxiv_v2_arXiv/img/bates.pdf b/arxiv_v2_arXiv/img/bates.pdf new file mode 100644 index 0000000..60a7138 Binary files /dev/null and b/arxiv_v2_arXiv/img/bates.pdf differ diff --git a/arxiv_v2_arXiv/img/bg_robustness.pdf b/arxiv_v2_arXiv/img/bg_robustness.pdf new file mode 100644 index 0000000..dcfd724 Binary files /dev/null and b/arxiv_v2_arXiv/img/bg_robustness.pdf differ diff --git a/arxiv_v2_arXiv/img/colorbar_horizontal.pdf b/arxiv_v2_arXiv/img/colorbar_horizontal.pdf new file mode 100644 index 0000000..8779c08 Binary files /dev/null and b/arxiv_v2_arXiv/img/colorbar_horizontal.pdf differ diff --git a/arxiv_v2_arXiv/img/fg_focus.pdf b/arxiv_v2_arXiv/img/fg_focus.pdf new file mode 100644 index 0000000..77b9b97 Binary files /dev/null and b/arxiv_v2_arXiv/img/fg_focus.pdf differ diff --git a/arxiv_v2_arXiv/img/fig-1.pdf b/arxiv_v2_arXiv/img/fig-1.pdf new file mode 100644 index 0000000..08adcb3 Binary files /dev/null and b/arxiv_v2_arXiv/img/fig-1.pdf differ diff --git a/arxiv_v2_arXiv/img/fig-2.pdf b/arxiv_v2_arXiv/img/fig-2.pdf new file mode 100644 index 0000000..14d154e Binary files /dev/null and b/arxiv_v2_arXiv/img/fig-2.pdf differ diff --git a/arxiv_v2_arXiv/img/infill_distr.pdf b/arxiv_v2_arXiv/img/infill_distr.pdf new file mode 100644 index 
0000000..c946d8c Binary files /dev/null and b/arxiv_v2_arXiv/img/infill_distr.pdf differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000090.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000090.JPEG new file mode 100644 index 0000000..c53d456 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000090.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000547.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000547.JPEG new file mode 100644 index 0000000..6ba3f1f Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000547.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000890.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000890.JPEG new file mode 100644 index 0000000..ff4714e Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00000890.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00002106.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00002106.JPEG new file mode 100644 index 0000000..f7e2a9d Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00002106.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00002743.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00002743.JPEG new file mode 100644 index 0000000..8e3ab58 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00002743.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00003097.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00003097.JPEG new file mode 100644 index 0000000..8c72c04 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00003097.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00005045.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00005045.JPEG new 
file mode 100644 index 0000000..76f03fd Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00005045.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00007437.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00007437.JPEG new file mode 100644 index 0000000..17ffcae Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00007437.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00008542.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00008542.JPEG new file mode 100644 index 0000000..1d886dd Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00008542.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00009674.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00009674.JPEG new file mode 100644 index 0000000..c900e83 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00009674.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00011629.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00011629.JPEG new file mode 100644 index 0000000..dafaeee Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00011629.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00025256.JPEG b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00025256.JPEG new file mode 100644 index 0000000..7d85d46 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/comp/ILSVRC2012_val_00025256.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG b/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG new file mode 100644 index 0000000..68f1ba8 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG 
b/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG new file mode 100644 index 0000000..075d682 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG b/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG new file mode 100644 index 0000000..d0251f6 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG differ diff --git a/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG b/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG new file mode 100644 index 0000000..c91c435 Binary files /dev/null and b/arxiv_v2_arXiv/img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG differ diff --git a/arxiv_v2_arXiv/img/size_bias_grid.pdf b/arxiv_v2_arXiv/img/size_bias_grid.pdf new file mode 100644 index 0000000..c6f3296 Binary files /dev/null and b/arxiv_v2_arXiv/img/size_bias_grid.pdf differ diff --git a/arxiv_v2_arXiv/main.pdf b/arxiv_v2_arXiv/main.pdf new file mode 100644 index 0000000..62eac50 Binary files /dev/null and b/arxiv_v2_arXiv/main.pdf differ diff --git a/arxiv_v2_arXiv/main.tex b/arxiv_v2_arXiv/main.tex new file mode 100644 index 0000000..650f02e --- /dev/null +++ b/arxiv_v2_arXiv/main.tex @@ -0,0 +1,48 @@ + +\documentclass[10pt,twocolumn,letterpaper]{article} + +\usepackage[pagenumbers]{cvpr} % + +\definecolor{cvprblue}{rgb}{0.21,0.49,0.74} +\usepackage[pagebackref,breaklinks,colorlinks,allcolors=cvprblue]{hyperref} +\input{packages} + +\def\paperID{4792} % +\def\confName{CVPR} +\def\confYear{2026} + +\newcommand{\name}{\textit{ForNet}\xspace} +\newcommand{\schemename}{\textit{ForAug}\xspace} +\title{\schemename: Mitigating Biases and Improving Vision Transformer Training by Recombining Foregrounds and Backgrounds} + +\author{ + Tobias Christian Nauen\textsuperscript{\rm 1,\rm 2}, + Brian Moser\textsuperscript{\rm 
2}, + Federico Raue\textsuperscript{\rm 2}, + Stanislav Frolov\textsuperscript{\rm 2}, + Andreas Dengel\textsuperscript{\rm 1,\rm 2} \\ + \textsuperscript{\rm 1}RPTU University Kaiserslautern-Landau, Kaiserslautern, Germany \\ + \textsuperscript{\rm 2}German Research Center for Artificial Intelligence (DFKI), Kaiserslautern, Germany \\ + {\tt\small first\_second.last@dfki.de / first.last@dfki.de} +} + +\begin{document} +\maketitle +\input{sec/abstract} +\input{sec/intro} +\input{sec/related_work} +\input{sec/method} +\input{sec/experiments} +\input{sec/conclusion} +\input{sec/acks} +{ + \small + \bibliographystyle{ieeenat_fullname} + \bibliography{main} +} + +\appendix +\onecolumn +\input{sec/appendix} + +\end{document} diff --git a/arxiv_v2_arXiv/packages.tex b/arxiv_v2_arXiv/packages.tex new file mode 100644 index 0000000..f5fe2d3 --- /dev/null +++ b/arxiv_v2_arXiv/packages.tex @@ -0,0 +1,123 @@ +\usepackage{color} + +\usepackage{amssymb} +\usepackage{amsfonts} +\usepackage{amsmath} +\usepackage[capitalize,noabbrev]{cleveref} +\usepackage{amsxtra} +\usepackage{cancel} +\usepackage{dsfont} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage{tikz-qtree} +\usetikzlibrary{shapes} +\usetikzlibrary{positioning} +\usetikzlibrary{trees} +\usepackage{mathcomp} +\usepackage{mathtools} +\usepackage{multirow} +\usepackage{verbatim} +\usepackage{polynom} +\usepackage{textcomp} +\usepackage{float} +\usepackage{placeins} +\usepackage{xcolor} +\usepackage{pdflscape} +\usepackage{csquotes} +\usepackage{afterpage} +\usepackage{makecell} +\usepackage{listings} +\usepackage{url} +\usepackage{enumitem} +\usepackage{minibox} +\usepackage{algorithm} +\usepackage{algorithmic} +\usepackage{shuffle} +\usepackage{svg} +\usepackage{pifont} +\usepackage{subcaption} +\usepackage{xspace} +\usepackage{siunitx} +\usepackage{booktabs} +\usepackage{microtype} +\usepackage{footmisc} + +\DeclareMathSymbol{\mlq}{\mathord}{operators}{``} +\DeclareMathSymbol{\mrq}{\mathord}{operators}{`'} + + 
+\newcommand{\R}{\mathbb R} +\newcommand{\N}{\mathbb N} +\newcommand{\calN}{\mathcal{N}} +\newcommand{\D}{\mathbb D} +\newcommand{\calD}{\mathcal D} +\newcommand{\C}{\mathbb C} +\renewcommand{\P}{\mathbb{P}} +\newcommand{\A}{\mathcal A} +\newcommand{\B}{\mathcal B} +\newcommand{\I}{\mathcal I} +\newcommand{\Z}{\mathbb{Z}} +\newcommand{\Q}{\mathbb{Q}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\F}{\mathcal{F}} +\newcommand{\G}{\mathcal{G}} +\newcommand{\M}{\mathcal{M}} +\renewcommand{\H}{\mathcal{H}} +\newcommand{\X}{\mathbb{X}} +\newcommand{\sigB}{\mathbb B} +\newcommand{\sigE}{\mathcal{E}} +\newcommand{\Ball}[2]{B_#1(#2)} % +\renewcommand{\L}{\mathcal{L}} +\newcommand{\eps}{\varepsilon} +\newcommand{\1}{\mathds{1}} +\newcommand{\To}{\longrightarrow} +\newcommand{\eqover}[1]{\stackrel{#1}{=}} +\newcommand{\darover}[1]{\xrightarrow{#1}} +\newcommand{\id}{\mathrm{id}} +\newcommand{\del}{\partial} +\newcommand{\indep}{\perp\!\!\!\perp} +\renewcommand{\Re}{\operatorname{Re}} +\renewcommand{\phi}{\varphi} +\renewcommand{\Im}{\operatorname{Im}} +\newcommand{\cov}{\operatorname{cov}} +\newcommand{\corr}{\operatorname{corr}} +\newcommand{\att}{\operatorname{attention}} +\newcommand{\norm}[1]{\left\lVert#1\right\rVert} +\newcommand{\abs}[1]{\left| #1 \right|} +\newcommand{\mat}[4]{\begin{pmatrix} #1 & #2 \\ #3 & #4 \end{pmatrix}} +\newcommand{\softmax}{\operatorname{softmax}} +\newcommand{\argmax}{\operatorname{argmax}} +\newcommand{\suff}{\operatorname{suff}} +\newcommand{\comp}{\operatorname{comp}} +\newcommand{\In}{\operatorname{In}} +\newcommand{\Var}{\operatorname{Var}} +\newcommand{\tensor}{\otimes} +\newcommand{\bigtensor}{\bigotimes} +\newcommand{\bx}{\mathbf{x}} +\newcommand{\by}{\mathbf{y}} +\newcommand{\bz}{\mathbf{z}} +\newcommand{\bB}{\mathbf{B}} +\newcommand{\grad}{\nabla} +\newcommand{\spanop}{\operatorname{span}} +\renewcommand{\S}{\mathcal{S}} +\newcommand{\Y}{\mathbb Y} +\newcommand{\Hoel}{\text{Höl}} +\newcommand{\Tau}{\mathcal{T}} 
+\newcommand{\W}{\mathcal{W}} +\renewcommand{\O}{\mathcal{O}} +\newcommand{\emptyword}{\varnothing} +\newcommand{\todo}[1]{\colorbox{red}{TODO: #1}} +\newcommand{\taylorsm}{\operatorname{T-SM}} +\newcommand{\ops}{\operatorname{ops}} +\newcommand{\entr}{\operatorname{entries}} +\newcommand{\gtxt}[1]{\text{\textcolor{gray}{#1}}} +\definecolor{DarkGreen}{RGB}{34,149,34} +\newcommand{\grntxt}[1]{\text{\textcolor{ForestGreen}{#1}}} +\newcommand{\rdtxt}[1]{\text{\textcolor{red}{#1}}} +\newcommand{\code}[1]{\texttt{#1}} +\newcommand{\cmark}{\ding{51}}% +\newcommand{\xmark}{\ding{55}}% + +\newcommand*\rot{\rotatebox{90}} +\newcommand{\tldr}{\textbf{TL;DR:}\xspace} + diff --git a/arxiv_v2_arXiv/preamble.tex b/arxiv_v2_arXiv/preamble.tex new file mode 100644 index 0000000..b324c7c --- /dev/null +++ b/arxiv_v2_arXiv/preamble.tex @@ -0,0 +1,11 @@ + +\newcommand{\red}[1]{{\color{red}#1}} +\newcommand{\todo}[1]{{\color{red}#1}} +\newcommand{\TODO}[1]{\textbf{\color{red}[TODO: #1]}} + + + + + + + diff --git a/arxiv_v2_arXiv/sec/abstract.tex b/arxiv_v2_arXiv/sec/abstract.tex new file mode 100644 index 0000000..a97c349 --- /dev/null +++ b/arxiv_v2_arXiv/sec/abstract.tex @@ -0,0 +1,13 @@ + +\begin{abstract} + Transformers, particularly Vision Transformers (ViTs), have achieved state-of-the-art performance in large-scale image classification. + However, they often require large amounts of data and can exhibit biases, such as center or size bias, that limit their robustness and generalizability. + This paper introduces \schemename, a novel data augmentation operation that addresses these challenges by explicitly imposing invariances into the training data, which are otherwise part of the neural network architecture. + \schemename is constructed by using pretrained foundation models to separate and recombine foreground objects with different backgrounds. + This recombination step enables us to take fine-grained control over object position and size, as well as background selection. 
+ We demonstrate that using \schemename significantly improves the accuracy of ViTs and other architectures by up to 4.5 percentage points (p.p.) on ImageNet, which translates to 7.3 p.p. on downstream tasks. + Importantly, \schemename not only improves accuracy but also opens new ways to analyze model behavior and quantify biases. + Namely, we introduce metrics for background robustness, foreground focus, center bias, and size bias and show that using \schemename during training substantially reduces these biases. + In summary, \schemename provides a valuable tool for analyzing and mitigating biases, enabling the development of more robust and reliable computer vision models. + Our code and dataset are publicly available at \code{https://github.com/tobna/ForAug}. +\end{abstract} diff --git a/arxiv_v2_arXiv/sec/acks.tex b/arxiv_v2_arXiv/sec/acks.tex new file mode 100644 index 0000000..b85cbae --- /dev/null +++ b/arxiv_v2_arXiv/sec/acks.tex @@ -0,0 +1,6 @@ + +\subsection*{Acknowledgements} +\label{sec:acknowledgements} + +This work was funded by the Carl-Zeiss Foundation under the Sustainable Embedded AI project (P2021-02-009). by the EU project SustainML (Horizon Europe grant agreement No 101070408) and by the BMFTR project Albatross (funding code 16IW24002). +All compute was done thanks to the Pegasus cluster at DFKI Kaiserslautern. diff --git a/arxiv_v2_arXiv/sec/appendix.tex b/arxiv_v2_arXiv/sec/appendix.tex new file mode 100644 index 0000000..b3d3dd8 --- /dev/null +++ b/arxiv_v2_arXiv/sec/appendix.tex @@ -0,0 +1,184 @@ + +\section{Extended Bates Distribution} +\begin{figure}[h!] + \centering + \includegraphics[width=.5\columnwidth]{img/bates.pdf} + \caption{Plot of the probability distribution function (PDF) of the extended Bates distribution for different parameters $\eta$. 
Higher values of $\eta$ concentrate the distribution around the center.} + \label{fig:bates-pdf} +\end{figure} + + +We introduce an extension of the Bates distribution~\cite{Bates1955} to include negative parameters, enabling sampling of foreground object positions away from the image center. +The standard Bates distribution, for $\eta \in \N$, is defined as the mean of $\eta$ independent random variables drawn from a uniform distribution \cite{Jonhson1995}. +A larger $\eta$ value increases the concentration of samples around the distribution's mean, which in this case is the image center. + +To achieve an opposite effect--concentrating samples at the image borders--we extend the distribution to $\eta \leq 1$. +\begin{align*} + X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta) +\end{align*} +This is accomplished by sampling from a standard Bates distribution with parameter $-\eta \geq 1$ and then applying a sawtooth function. +The sawtooth function on the interval $[0,1]$ is defined as +\begin{align} + s(x) = \begin{cases} + x + 0.5 & \text{if } 0 < x < 0.5 \\ + x - 0.5 & \text{if } 0.5 \leq x \leq 1 + \end{cases} +\end{align} +This function effectively maps the central portion of the interval to the edges and the edge portions to the center. +For example, a value of 0.3 (central-left) is mapped to 0.8 (edge-right), while 0.8 (edge-right) is mapped to 0.3 (central-left). +This transformation inverts the distribution's concentration, shifting the probability mass from the center to the borders. +We visualize the distribution function of the extended Bates distribution in \Cref{fig:bates-pdf}. +Both $\eta = 1$ and $\eta = -1$ result in a uniform distribution across the image. + +\section{Resource Usage of \schemename} +To utilize the proposed \schemename, specific computational resources are necessary, particularly for computing and storing for the output of the segmentation stage and for on-the-fly processing of the recombination stage. 
+ +\paragraph{Segmentation.} +\schemename involves a computationally expensive segmentation and infill stage, which is a one-time calculation per dataset. +Once computed, the segmentation and infill results can be perpetually reused, amortizing the initial cost over all subsequent experiments and applications. +On NVIDIA H100 GPUs, the segmentation stage will compute at a rate of $374.3 \frac{\text{img}}{\text{GPU} \times \text{h}}$ when using Attentive Eraser or $5 338.6 \frac{\text{img}}{\text{GPU} \times \text{h}}$ for LaMa. +For ImageNet this comes down to just under 9 days (Attentive Eraser) or 16 hours (LaMa) on two 8 GPU nodes. +To facilitate immediate use and reproduction of results, we publicly provide the precalculated segmentation stage output for the ImageNet dataset for download\footnote{Link will go here.}. +The output of \schemename's segmentation step on ImageNet dataset requires 73 GB of additional disk space for the segmentation output, which is separate from the base 147 GB ImageNet size. + +\paragraph{Recombination.} +The recombination step of \schemename is implemented as a based data loader operation. +It's thus offloaded to the CPU, where it can be heavily parallelized and thus only results in a very minor increase in the training step-time. +For example, using a ViT-B model on an NVIDIA A100 GPU, the average update step-time increased by $1\%$, from $528 \pm 2$ ms to $534 \pm 1$ ms. + + +\section{Training Setup} +\label{sec:training_setup} + +\begin{table*}[h!] 
+ \centering + \caption{Training setup and hyperparameters for our ImageNet training.} + \label{tab:in-setup} + \begin{tabular}{lcc} + \toprule + Parameter & ViT, Swin, ResNet & DeiT \\ + \midrule + Image Resolution & $224 \times 224$ & $224 \times 224$ \\ + Epochs & 300 & 300 \\ + Learning Rate & 3e-3 & S/B: 1e-3, L: 5e-4 \\ + Learning Rate Schedule & cosine decay & cosine decay \\ + Batch Size & 2048 & 1024 \\ + GPUs & $4\times$ NVIDIA A100/H100/H200 & $4\times$ NVIDIA A100/H100/H200 \\ + Warmup Schedule & linear & linear \\ + Warmup Epochs & 3 & 3 \\ + Weight Decay & 0.02 & 0.05 \\ + Label Smoothing & 0.1 & 0.1 \\ + Optimizer & Lamb \cite{You2020} & AdamW \\ + \cmidrule(r){1-1} + Data Augmentation Policy & \textbf{3-Augment \cite{Touvron2022}} & \textbf{DeiT \cite{Touvron2021b}} \\ + Augmentations & \makecell{Resize \\ RandomCrop \\ HorizontalFlip \\ Grayscale \\ Solarize \\ GaussianBlur \\ ColorJitter \\ CutMix \cite{Yun2019}} & \makecell{RandomResizedCrop \\ HorizontalFlip \\ RandomErase \cite{Zhong2017} \\ RandAugment \cite{Cubuk2019} \\ ColorJitter \\ Mixup \cite{Zhang2018a} \\ CutMix \cite{Yun2019}} \\ + \bottomrule + \end{tabular} +\end{table*} + +\begin{table}[h!] + \centering + \caption{Training setup for finetuning on different downstream datasets. Other settings are the same as in \Cref{tab:in-setup}. For finetuning, we always utilize 3-Augment and the related parameters from the \emph{ViT, Swin, ResNet} column of \Cref{tab:in-setup}} + \label{tab:downstream-setup} + \begin{tabular}{lcccc} + \toprule + Dataset & Batch Size & Epochs & Learning Rate & Num. GPUs \\ + \midrule + Aircraft & 512 & 500 & 3e-4 & 2 \\ + Cars & 1024 & 500 & 3e-4 & 4 \\ + Flowers & 256 & 500 & 3e-4 & 1 \\ + Food & 2048 & 100 & 3e-4 & 4 \\ + Pets & 512 & 500 & 3e-4 & 2 \\ + \bottomrule + \end{tabular} +\end{table} +On ImageNet we use the same training setup as \cite{Nauen2025} and \cite{Touvron2022} without pretraining for ViT, Swin, and ResNet. 
+For DeiT, we train the same ViT architecture but using the data augmentation scheme and hyperparameters from \cite{Touvron2021b}.
+As our focus is on evaluating the changes in accuracy due to \schemename, like \cite{Nauen2025}, we stick to one set of hyperparameters for all models.
+We list the settings used for training on ImageNet in \Cref{tab:in-setup} and the ones used for finetuning those weights on the downstream datasets in \Cref{tab:downstream-setup}.
+Our implementation uses PyTorch \cite{Paszke2019} and the \emph{timm} library \cite{Wightman2019} for model architectures and basic functions.
+
+\begin{table*}[h!]
+ \centering
+ \caption{Hardware and Software specifics used for both training and evaluation.}
+ \label{tab:hw-sw-versions}
+ \begin{tabular}{ll}
+ \toprule
+ Parameter & Value \\
+ \midrule
+ GPU & NVIDIA A100/H100/H200 \\
+ CPU & 24 CPU cores (Intel Xeon) per GPU \\
+ Memory & up to 120 GB per GPU \\
+ Operating System & Enroot container for SLURM based on Ubuntu 24.04 LTS \\
+ Python & 3.12.3 \\
+ PyTorch & 2.7.0 \\
+ TorchVision & 0.22.0 \\
+ Timm & 1.0.15 \\
+ \bottomrule
+ \end{tabular}
+\end{table*}
+\Cref{tab:hw-sw-versions} lists the specific hardware we use, as well as versions of the relevant software packages.
+
+
+\section{Infill Model Comparison}
+\begin{table*}[h!]
+ \centering
+ \caption{Example infills of LaMa and Attentive Eraser.}
+ \label{tab:infill-examples}
+ \resizebox{.9\textwidth}{!}{
+ \begin{tabular}{cc@{\hskip 0.3in}cc}
+ \toprule
+ LaMa & Att. Eraser & LaMa & Att. 
Eraser \\ + \midrule + \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00000090.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00000090.JPEG} & + \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00000890.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00000890.JPEG} \\ + \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00002106.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00002106.JPEG} & + \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00005045.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00005045.JPEG} \\ + \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00007437.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00007437.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00008542.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00008542.JPEG} \\ + \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00009674.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00009674.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00002743.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00002743.JPEG} \\ + \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00003097.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00003097.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00011629.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00011629.JPEG} \\ + 
\includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00000547.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00000547.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00025256.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00025256.JPEG} \\ + \bottomrule + \end{tabular} + } +\end{table*} +We visualize example infilled images for both LaMa \cite{Suvorov2021} and Attentive Eraser \cite{Sun2024} in \Cref{tab:infill-examples}. +We qualitatively find that while LaMa often leaves repeated textures of blurry spots where the object was erased, Attentive Eraser produces slightly cleaner and more coherent infills of the background. + +\newpage +\section{Image Infill Ratio} +\begin{table*}[h!] + \centering + \caption{Example infills with a large relative foreground area size that is infilled (infill ratio).} + \label{tbl:high-rat} + \resizebox{.8\textwidth}{!}{ + \begin{tabular}{ccc} + \toprule + Infill Ratio & LaMa & Att. 
Eraser \\
+ \midrule
+ 93.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} \\ \\
+ 95.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} \\ \\
+ 83.7 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} \\ \\
+ 88.2 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG}}
+ \end{tabular}}
+\end{table*}
+
+\begin{figure}
+ \centering
+ \includegraphics[width=.9\textwidth]{img/infill_distr.pdf}
+ \caption{We plot the distribution of the relative size of the detected foreground object that is infilled in our Segmentation step of ImageNet.
+ While most images contain objects of smaller size, there is a peak where Grounded~SAM~\cite{Ren2024} detects almost the whole image as the foreground object. For examples of such large infills, see \Cref{tbl:high-rat}.
+ }
+ \label{fig:infill-distr}
+\end{figure}
+
+\Cref{tbl:high-rat} shows infills for images where Grounded SAM \cite{Ren2024} marks a large percentage of the image as the foreground object (Infill Ratio), which has to be erased by the infill models.
+While LaMa tends to fill those spots with mostly black or gray and textures similar to what we saw in \Cref{tab:infill-examples}, Attentive Eraser tends to create novel patterns by copying what is left of the background all over the rest of the image. 
+\Cref{fig:infill-distr} plots the distribution of infill ratios in \schemename.
+While there is a smooth curve of the number of detections decreasing with the infill ratio until $\approx 90\%$, there is an additional peak at $\approx 100\%$ infill ratio.
+We believe that this peak is made up of failure cases of Grounded~SAM.
+
+We filter out all backgrounds that have an infill ratio larger than our pruning threshold $t_\text{prune} = 0.8$, which translates to $10\%$ of backgrounds.
+
+
+
diff --git a/arxiv_v2_arXiv/sec/conclusion.tex b/arxiv_v2_arXiv/sec/conclusion.tex
new file mode 100644
index 0000000..81a22a6
--- /dev/null
+++ b/arxiv_v2_arXiv/sec/conclusion.tex
@@ -0,0 +1,10 @@
+
+\section{Discussion \& Conclusion}
+\label{sec:conclusion}
+
+We introduce \schemename, a novel data augmentation scheme that facilitates improved Transformer training for image classification.
+By explicitly separating and recombining foreground objects and backgrounds, \schemename enables controlled data augmentation beyond existing image compositions, leading to significant performance gains on ImageNet and downstream fine-grained classification tasks.
+Furthermore, \schemename provides a powerful framework for analyzing model behavior and quantifying biases, including background robustness, foreground focus, center bias, and size bias.
+Our experiments demonstrate that training using \schemename not only boosts accuracy but also significantly reduces these biases, resulting in more robust and generalizable models.
+In the future, we see \schemename also being applied to other datasets and tasks, such as video recognition or segmentation.
+\schemename's ability to both improve performance and provide insights into model behavior makes it a valuable tool for advancing CV research and developing more reliable AI systems. 
diff --git a/arxiv_v2_arXiv/sec/experiments.tex b/arxiv_v2_arXiv/sec/experiments.tex
new file mode 100644
index 0000000..3d6c7b8
--- /dev/null
+++ b/arxiv_v2_arXiv/sec/experiments.tex
@@ -0,0 +1,411 @@
+
+\section{Experiments}
+\label{sec:experiments}
+
+
+We conduct a comprehensive suite of experiments to validate the effectiveness of our approach,
+comparing ImageNet training with and without \schemename for 10 different models.
+Furthermore, we assess the impact of using \schemename for pretraining on multiple fine-grained downstream datasets.
+Finally, we exploit \schemename's control over the image distribution to quantify model behaviors and biases.
+We always report the mean and standard deviation of three independent training runs.
+
+\subsection{Design Choices of ForAug}
+\label{sec:ablation}
+
+We start by ablating the design choices of \schemename on TinyImageNet~\cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each. %
+\Cref{tab:ablation-segment} presents ablations for segmentation and \Cref{tab:ablation-recombine} for recombination.
+
+\begin{table}
+ \caption{Ablation of the design decisions in the segmentation phase of \schemename on TinyImageNet.
+ The first line is our baseline, while the other lines use \schemename.
+ We use basic settings with the \emph{same} background strategy during recombination for this experiment.
+ }
+ \label{tab:ablation-segment}
+ \centering
+ \small
+ \resizebox{.9\columnwidth}{!}{
+ \begin{tabular}{cccc}
+ \toprule
+ \multirow{2.5}{*}{\makecell{Detect. \\Prompt}} & \multirow{2.5}{*}{\makecell{Infill \\ Model}} & \multicolumn{2}{c}{TinyImageNet Accuracy [\%]} \\
+ \cmidrule{3-4}
+ & & ViT-Ti & ViT-S \\
+ \midrule
+ \multicolumn{2}{l}{\textbf{TinyImageNet}} & $66.1 \pm 0.5$ & $68.3 \pm 0.7$ \\
+ specific & LaMa \cite{Suvorov2021} & $65.5 \pm 0.4$ & $71.2 \pm 0.5$ \\
+ general & \gtxt{LaMa \cite{Suvorov2021}} & $66.4 \pm 0.6$ & $72.9 \pm 0.6$ \\
+ \gtxt{general} & Att. 
Eraser \cite{Sun2024} & $67.5 \pm 1.2$ & $72.4 \pm 0.5$ \\ + \bottomrule + \end{tabular}} +\end{table} + +\begin{table}[t] + \caption{Ablation of the recombination phase of \schemename on TinyImageNet (top) and ImageNet (bottom). The first experiments use the initial segmentation settings with LaMa \cite{Suvorov2021}.} + \label{tab:ablation-recombine} + \centering + \resizebox{\columnwidth}{!}{ + \begin{tabular}{ccccccccccc} + \toprule + \multirow{2.5}{*}{\makecell{FG. \\size}} & \multirow{2.5}{*}{\makecell{Augment.\\Order}} & \multirow{2.5}{*}{\makecell{BG\\Strat.}} & \multirow{2.5}{*}{\makecell{BG.\\Prune}} & \multirow{2.5}{*}{\makecell{Original\\Mixing}} & \multirow{2.5}{*}{\makecell{Edge\\Smooth.}} & \multicolumn{2}{c}{Accuracy [\%]} \\ + \cmidrule{7-8} + & & & & & & ViT-Ti & ViT-S \\ + \midrule + \multicolumn{6}{l}{\textbf{TinyImageNet}} & \gtxt{$66.1\pm0.5$} & \gtxt{$68.3\pm0.7$} \\ + mean & crop$\to$paste & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\ + range & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\ + \midrule + {range} & {crop$\to$paste} & {same} & {-} & {-} & {-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\ + \gtxt{range} & paste$\to$crop & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.2$ & \gtxt{-} & $69.8\pm0.5$ & $75.0\pm0.3$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.33$ & \gtxt{-} & $69.5\pm0.4$ & $75.2\pm1.0$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.5$ & \gtxt{-} & $70.3\pm1.0$ & 
$74.2\pm0.2$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & linear & \gtxt{-} & $70.1\pm0.7$ & $74.9\pm0.8$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & reverse lin. & \gtxt{-} & $67.6\pm0.2$ & $73.2\pm0.3$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & cos & \gtxt{-} & $71.3\pm1.0$ & $75.7\pm0.8$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & $70.0\pm0.8$ & $75.5\pm0.7$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & orig. & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $67.2\pm0.9$ & $69.9\pm1.0$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $70.1\pm0.7$ & $77.5\pm0.6$ \\ + \midrule + \multicolumn{6}{l}{\textbf{ImageNet}} & \gtxt{-} & \gtxt{$79.1\pm0.1$} \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & \gtxt{-} & - & $80.5\pm0.1$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & - & $80.7\pm0.1$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & - & $81.4\pm0.1$ \\ + \bottomrule + \end{tabular}} +\end{table} + + +\textbf{Prompt.} +First, we evaluate the type of prompt used to detect the foreground object. +Here, the \emph{general} prompt, which contains the class and the more general object category, outperforms only having the class name (\emph{specific}). + +\textbf{Inpainting.} Among inpainting models, Attentive Eraser~\cite{Sun2024} produces slightly better results compared to LaMa~\cite{Suvorov2021} ($+0.5$ p.p. on average). +For inpainting examples, see the supplementary material. + +\textbf{Foreground size} +significantly impacts performance. +Employing a \emph{range} of sizes during recombination, rather than a fixed \emph{mean} size, boosts accuracy by approximately 1 p.p. 
+This suggests that the added variability is beneficial. + +\textbf{Order of data augmentation.} +Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}). +ViT-Ti results are ambiguous. + +\textbf{Background pruning.} +When it comes to the backgrounds to use, we test different pruning thresholds ($t_\text{prune}$) to exclude backgrounds with large inpainting. +A threshold of $t_\text{prune}=1.0$ means that we use all backgrounds that are not fully infilled. +Varying $t_\text{prune}$ has minimal impact. +We choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds. + +\textbf{Mixing} \schemename-augmented samples with the original ImageNet data proves crucial. +While constant and linear mixing schedules improve performance over no mixing by $2-3$ p.p. compared to only augmented samples, the cosine annealing schedule proves optimal, boosting accuracy by $3-4$ p.p. + +\textbf{Edge smoothing.} +We evaluate the impact of using Gaussian blurring to smooth the edges of the foreground masks. +For larger models, this gives us a slight performance boost on the full ImageNet (second to last line in \Cref{tab:ablation-recombine}). + +\textbf{Background strategy.} +Another point is the allowed choice of background image for each foreground object. +We compare using the original background, a background from the same class, and any background. +These strategies go from low diversity and high shared information content between the foreground and background to high diversity and low shared information content. +For \emph{ViT-Ti}, the latter two strategies perform comparably, while \emph{ViT-S} benefits from the added diversity of using any background. +The same is true when training on the full ImageNet. 
+ + +\begin{table} + \caption{Accuracy of ViT-S on TinyImageNet (TIN) in percent using \schemename with different foreground position distributions by varying the Bates parameter $\eta$. + The best performance is achieved when using the uniform distribution ($\eta=1$) for training.} + \label{tbl:foreground-eta} + \centering + \small + \resizebox{.9\columnwidth}{!}{ + \begin{tabular}{ccccccc} + \toprule + \multirow{2.5}{*}{\makecell{Bates Parameter \\during training}} & \multirow{2.5}{*}{\makecell{TIN \\w/o \schemename}} & \multicolumn{5}{c}{TIN w/ \schemename} \\ + \cmidrule(l){3-7} + & & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\ + \midrule + Baseline & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 \\ + $\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\ + $\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\ + $\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\ + $\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\ + $\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\ + \bottomrule + \end{tabular}} +\end{table} + +\textbf{Foreground position.} +Finally, we analyze the foreground object's positioning in the image, using a +generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \Z$. +The Bates distribution presents an easy way to sample from a bounded domain with just one hyperparameter that controls its concentration. +$\eta = 1/-1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders (see supplementary material for details). +When sampling more towards the center of the image, the difficulty of the task is reduced, which reduces performance on TinyImageNet (\Cref{tbl:foreground-eta}). +This is reflected in the performance when evaluating using \schemename with $\eta=2$ and $\eta=3$ compared to $\eta=-1/1$. +We observe a similar reduction for $\eta < -1$. 
+ +\begin{table} + \caption{Dataset statistics for TinyImageNet and ImageNet with and without \schemename. For \schemename we report the number of foreground/background pairs.} + \label{tab:dataset-stats} + \centering + \resizebox{.9\columnwidth}{!}{ + \begin{tabular}{l S[table-format=4.0] S[table-format=7.0] S[table-format=5.0]} + \toprule + Dataset & {Classes} & {\makecell{Training \\ Images}} & {\makecell{Validation \\ Images}} \\ + \midrule + TinyImageNet & 200 & 100000 & 10000 \\ + TinyImageNet + \schemename & 200 & 99404 & 9915 \\ + ImageNet & 1000 & 1281167 & 50000 \\ + ImageNet + \schemename & 1000 & 1274557 & 49751 \\ + \bottomrule + \end{tabular}} +\end{table} +After fixing the optimal design parameters in \Cref{tab:ablation-segment,tab:ablation-recombine} (last rows), we run \schemename's segmentation step on the entire ImageNet dataset. +\Cref{tab:dataset-stats} shows the resulting dataset statistics. +The slightly reduced image count for \schemename is due to instances where Grounded SAM fails to produce valid segmentation masks. + + +\subsection{Image Classification Results} + +\begin{table} + \caption{ImageNet results of models trained on ImageNet with and without \schemename. 
\schemename improves the performance of most models, with a larger gain for larger models.} + \label{tab:imagenet-results} + \centering + \small + \resizebox{.8\columnwidth}{!}{\begin{tabular}{lccc} + \toprule + \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\ + \cmidrule(lr){2-3} + & w/o \schemename & w/ \schemename & \\ + \midrule + ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\ + ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\ + ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\ + \midrule + DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\ + DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\ + DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\ + \midrule + Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\ + Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\ + \midrule + ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\ + ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\ + \bottomrule + \end{tabular}} +\end{table} + +\Cref{tab:imagenet-results} compares the ImageNet performance of models trained with and without \schemename. +We adopt the training setup of \cite{Nauen2025} and \cite{Touvron2022} for training ViT \cite{Dosovitskiy2021}, Swin \cite{Liu2021} and ResNet \cite{He2016} (representing CNNs) models as well as the setup of DeiT \cite{Touvron2021b} for that model. +Both setups are using strong data augmentations like RandAugment, CutMix, and Mixup optimized for Transformers (details in supplementary material). +Notably, \schemename improves performance across all tested architectures, including the ResNet models, % +demonstrating benefits beyond Transformers. +For DeiT we only observe benefits on ImageNet for the larger models. +For other transformers, we observe improvements from $1.2$ p.p. to $4.5$ p.p. with increasing gains for larger models. 
+\schemename's improvements counteract the drop in performance for increasing model sizes.
+Without \schemename this drop is $3.8$ p.p. (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p.
+For DeiT there is a drop of $0.8$ p.p. from small to large, while when using \schemename there is a \emph{gain} of $2.4$ p.p.
+
+\begin{table}
+ \caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-augment data augmentation on top of the copy-paste augmentation.}
+ \label{tab:copy-paste-comparison}
+ \centering
+ \resizebox{\columnwidth}{!}{
+ \begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]}
+ \toprule
+ Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\
+ \midrule
+ Baseline + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\
+ + mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\
+ + fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\
+ + \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\
+ + infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\
+ + \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\
+ + edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\
+ + background pruning$=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\
+ \bottomrule
+ \end{tabular}}
+\end{table}
+\textbf{Comparison to Simple Copy-Paste.}
+We compare \schemename to a simple adaptation of the Copy-Paste augmentation inspired by \cite{Ge2023,Ghiasi2020,Shermaine2025} in \Cref{tab:copy-paste-comparison}.
+In contrast to semantic segmentation, we do not have foreground masks available.
+Thus, we paste the extracted foreground objects from \emph{\schemename's segmentation stage} onto normal ImageNet images. 
+We observe 3 large jumps in accuracy: (\textbf{1}) From our \emph{range} foreground size variation (+11.4\%), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset (+25.7\%), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images (+12.5\%). +\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance. + +\begin{table}[t] + \caption{Downstream accuracy in percent when finetuning on other datasets. Models are pretrained on ImageNet with and without \schemename. Pretraining using \schemename increases transformer downstream accuracy. + } + \label{tab:downstream-results} + \centering + \resizebox{\columnwidth}{!}{\begin{tabular}{lcccccc} + \toprule + Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\ + \midrule + ViT-S & \xmark & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\ + ViT-S & \cmark & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\ + & & \grntxt{$+6.2$} & \grntxt{$+2.4$} & \grntxt{$+1.0$} & \grntxt{$+0.5$} & \grntxt{$+0.7$} \\ + \cmidrule(r){1-1} + ViT-B & \xmark & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\ + ViT-B & \cmark & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\ + & & \grntxt{$+7.3$} & \grntxt{$+3.3$} & \grntxt{$+1.7$} & \grntxt{$+1.1$} & \grntxt{$+1.0$} \\ + \cmidrule(r){1-1} + ViT-L & \xmark & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\ + ViT-L & \cmark & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\ + & & \grntxt{$+5.5$} & \grntxt{$+0.3$} & \grntxt{$+2.2$} & \grntxt{$+1.2$} & \grntxt{$+0.9$} \\ + \midrule + DeiT-S & \xmark & $75.3\pm0.4$ & $91.1\pm0.2$ & $94.8\pm0.4$ & $89.2\pm0.2$ & $92.4\pm0.2$ \\ + DeiT-S & \cmark & $76.8\pm0.8$ & $91.9\pm0.2$ & $95.2\pm0.3$ & $89.1\pm0.2$ & $92.3\pm0.4$ \\ + & & \grntxt{$+1.5$} & \grntxt{$+0.8$} & 
\grntxt{$+0.4$} & \gtxt{$-0.1$} & \gtxt{$-0.1$} \\ + \cmidrule(r){1-1} + DeiT-B & \xmark & $77.0\pm1.2$ & $92.9\pm0.2$ & $96.1\pm0.2$ & $91.2\pm0.1$ & $93.3\pm0.4$ \\ + DeiT-B & \cmark & $79.3\pm0.3$ & $93.1\pm0.1$ & $96.4\pm0.2$ & $91.3\pm0.1$ & $93.3\pm0.1$ \\ + & & \grntxt{$+2.3$} & \gtxt{$+0.2$} & \grntxt{$+0.3$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\ + \cmidrule(r){1-1} + DeiT-L & \xmark & $72.8\pm5.5$ & $92.8\pm1.0$ & $95.8\pm1.5$ & $90.5\pm2.6$ & $92.4\pm2.0$ \\ + DeiT-L & \cmark & $78.8\pm0.8$ & $93.8\pm0.2$ & $97.0\pm0.2$ & $92.0\pm0.2$ & $93.5\pm0.2$ \\ + & & \grntxt{$+6.0$} & \grntxt{$+1.0$} & \grntxt{$+1.2$} & \grntxt{$+1.5$} & \grntxt{$+1.1$} \\ + \midrule + Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\ + Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\ + & & \grntxt{$+4.1$} & \grntxt{$+2.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\ + \cmidrule(r){1-1} + Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\ + Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\ + & & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+1.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\ + \midrule + ResNet-50 & \xmark & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\ + ResNet-50 & \cmark & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\ + & & \grntxt{$+2.1$} & \grntxt{$+0.6$} & \gtxt{$\pm0.0$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\ + \cmidrule(r){1-1} + ResNet-101 & \xmark & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\ + ResNet-101 & \cmark & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\ + & & \grntxt{$+3.0$} & \grntxt{$+1.3$} & \grntxt{$+1.7$} & \grntxt{$+0.3$} & \textcolor{red}{$-0.3$} \\ + \bottomrule + \end{tabular}} +\end{table} + +\textbf{Downstream tasks.} To assess the 
transferability of \schemename-trained models, we finetune models pretrained on ImageNet with and without \schemename on five fine-grained datasets: +FGVC-Aircraft \cite{Maji2013}, Stanford Cars~\cite{Dehghan2017}, Oxford Flowers \cite{Nilsback2008}, Food-101 \cite{Kaur2017}, and Oxford-IIIT Pets \cite{Parkhi2012}. +In \Cref{tab:downstream-results} we see transformer accuracies improve on all these datasets by up to 7.3 p.p. +Notably, training with \schemename boosts the downstream performance of DeiT-S and DeiT-B, despite similar ImageNet results. +This shows the improved representations from training with \schemename translate to gains beyond better ImageNet scores. + + +\subsection{Bias and Robustness Evaluation} +Beyond its use for training, \schemename's unique properties and controlled data generation capabilities make it a powerful tool for analyzing behavior and biases of black-box models. + +\begin{figure*} + \centering + \includegraphics[width=.95\textwidth]{img/bg_robustness.pdf} + \caption{Evaluation of background robustness on ImageNet + \schemename, ImageNet9 and CounterAnimal. + We plot the in-distribution (top of arrow) and the out-of-distribution (bottom of arrow) accuracy when training with and without \schemename. + We annotate each arrow with its length $\Delta$. + Training with \schemename improves the background robustness of all transformers by mostly boosting the out-of-distribution accuracy. + } + \label{fig:background-robustness} +\end{figure*} + +\textbf{Background Robustness.} +We assess the robustness of models to shifts in the background distribution from a class-related background to any background. +\Cref{fig:background-robustness} presents the background robustness results for three datasets: ImageNet with \schemename (all backgrounds vs. backgrounds of same class), ImageNet9 \cite{Xiao2020} (random backgrounds vs. original backgrounds), and CounterAnimal \cite{Wang2024f} (counter vs. common background). 
+The top triangle of each arrow represents the in-distribution backgrounds and the bottom triangle represents the out-of-distribution ones. +We follow ImageNet9 and CounterAnimal and assess the background robustness in terms of the accuracy gap when evaluating a model on images of normal background distribution compared to out-of-distribution backgrounds (length of each arrow; $\Delta$). +Crucially, \schemename improves the background robustness of all models and across datasets, reducing the background-gap by boosting the performance on the out-of-background-distribution samples more than the in-distribution ones. +These findings highlight the generalization benefits of \schemename to unusual image compositions. + +\begin{figure*} + \centering + \includegraphics[width=.95\textwidth]{img/fg_focus.pdf} + \caption{Evaluation of the foreground focus (\Cref{eq:fg-focus}) using GradCam, GradCam++ and IntegratedGradients (IG) of models trained on ImageNet. Training with \schemename improves the foreground focus of almost all models.} + \label{fig:foreground-focus} +\end{figure*} + +\textbf{Foreground Focus.} +Leveraging our inherent knowledge of the foreground masks when using \schemename, as well as common XAI techniques~\cite{Selvaraju2016,Chattopadhay2018,Sundararajan2017}, we can evaluate a model's focus on the foreground object. +We can directly evaluate ImageNet-trained models, but this technique can also be extended to other datasets without relying on manually annotated foreground masks. +To evaluate the foreground focus, we employ Grad-CAM \cite{Selvaraju2016}, Grad-CAM++ \cite{Chattopadhay2018} and IntegratedGradients (IG) \cite{Sundararajan2017} to compute the per-pixel importance of an image for the model's prediction. 
+The foreground focus is defined to be the ratio of the foreground's relative importance to its relative size in the image:
+\begin{align} \label{eq:fg-focus}
+  \text{FG Focus}(\text{img}) = \frac{\text{Area}(\text{img}) \hspace{3pt} \text{Importance}(\text{fg})}{\text{Area}(\text{fg}) \hspace{3pt} \text{Importance}(\text{img})}
+\end{align}
+If all pixels uniformly receive the same importance value, the foreground focus is one.
+The foreground focus of a model is its average focus over all test images.
+\Cref{fig:foreground-focus} presents our findings.
+Using \schemename significantly increases the foreground focus of ViT, DeiT and ResNet across all XAI metrics.
+We hypothesize that Swin's below-uniform foreground focus with GradCam is due to its specific implementation.
+
+\begin{table}[t]
+ \caption{
+ Accuracy relative to the center accuracy of multiple instantiations of the models when the foreground object is in different cells of a $3 \times 3$ grid.
+ We calculate center bias according to \Cref{eq:center-bias}. 
+ Using \schemename significantly reduces models' center bias.} + \label{tab:center-bias} + \centering + \resizebox{.78\columnwidth}{!}{ + \begin{tabular}{lccc} + \toprule + \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\ + \cmidrule(lr){2-3} + & w/o \schemename & w/ \schemename \\ + \midrule + ViT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v3.pdf}} \\ + & $25.5\pm0.8$ & $22.0\pm0.3$ & \grntxt{$-3.5$} \\ + ViT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v3.pdf}} \\ + & $25.4\pm0.4$ & $19.0\pm0.2$ & \grntxt{$-6.4$} \\ + ViT-L & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v3.pdf}} \\ + & $24.3\pm1.1$ & $11.7\pm0.7$ & \grntxt{$-12.6$} \\ + \midrule + DeiT-S & 
\raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v3.pdf}} \\ + & $20.4 \pm 0.2$ & $21.2 \pm 0.1$ & \gtxt{$+0.8$} \\ + DeiT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v3.pdf}} \\ + & $19.0 \pm 0.7$ & $19.0 \pm 0.2$ & \gtxt{$\pm0.0$} \\ + DeiT-L & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v3.pdf} } & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v3.pdf} } \\ + & $21.2 \pm 0.2$ & $18.0 \pm 0.2$ & \grntxt{$-3.2$} \\ + \midrule + Swin-Ti & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v1.pdf} 
\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v3.pdf}} \\ + & $25.0\pm0.7$ & $16.5\pm0.2$ & \grntxt{$-8.5$} \\ + Swin-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v3.pdf}} \\ + & $23.2\pm0.1$ & $15.6\pm0.2$ & \grntxt{$-7.6$} \\ + \midrule + ResNet50 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v3.pdf}} \\ + & $26.3\pm0.3$ & $19.7\pm0.3$ & \grntxt{$-6.6$} \\ + ResNet101 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v3.pdf}} \\ + & $23.0\pm0.3$ & $19.9\pm0.2$ & \grntxt{$-3.1$} \\ + \bottomrule + \end{tabular} } + \includegraphics[width=.8\columnwidth]{img/colorbar_horizontal.pdf} +\end{table} + +\textbf{Center Bias.} +With \schemename we have 
unique control over the position of the foreground object in the image. +This lets us quantify the center bias of models trained with and without \schemename. +We divide the image into a $3 \times 3$ grid and evaluate model accuracy when the (scaled-down) foreground object is in each of the $9$ grid cells. +Each cell's accuracy is divided by the accuracy in the center cell for normalization, which gives us the relative performance drop when the foreground is in each part of the image. +The center bias is calculated as one minus the average of the minimum performance of a side cell and the minimum performance of a corner cell: +\begin{align} \label{eq:center-bias} + \text{Center Bias} = 1 - \frac{\min\limits_{c \in \text{sides}} \text{Acc}(c) + \min\limits_{c \in \text{corners}} \text{Acc}(c)}{2 \text{Acc}(c_\text{center})} +\end{align} +\Cref{tab:center-bias} visualizes the center bias of three instantiations of each model. +Performance is generally highest in the center and lowest in the four corners. +Interestingly, ImageNet-trained models perform slightly better when the foreground object is on the right side of the image, compared to the left side, despite our use of random flipping with a probability of $0.5$ during training. +Using \schemename significantly reduces center bias across models, with a more uniform performance especially across the middle row. +Thus, \schemename makes the model recognize objects across a wider spatial distribution, counteracting the center bias of ImageNet. + +\begin{figure}[t!] + \centering + \includegraphics[width=\columnwidth]{img/size_bias_grid.pdf} + \caption{Evaluation of the size bias of models trained on ImageNet. We plot the accuracy relative to the accuracy when using the default size ($f_\text{size} = 1.0$).} + \label{fig:size-bias} +\end{figure} + +\textbf{Size Bias.} +Finally, we evaluate the impact of differently sized foreground objects on the accuracy.
+For this evaluation, we use the \emph{mean} foreground size strategy. +We introduce a size factor $f_\text{size}$ by which we additionally scale the foreground object before pasting it onto the background. +Results are normalized by the accuracy when using $f_\text{size} = 1.0$. +\Cref{fig:size-bias} shows the size bias curves of models trained with and without \schemename. +Models trained using \schemename perform better, especially with smaller foreground objects. +Therefore, \schemename-training improves robustness to variations in object scale, especially for larger models. diff --git a/arxiv_v2_arXiv/sec/intro.tex b/arxiv_v2_arXiv/sec/intro.tex new file mode 100644 index 0000000..04d42b8 --- /dev/null +++ b/arxiv_v2_arXiv/sec/intro.tex @@ -0,0 +1,48 @@ + +\section{Introduction} +\label{sec:intro} + + +\begin{figure} + \centering + \includegraphics[width=\columnwidth]{img/fig-1.pdf} + \caption{Comparison of traditional image classification training and training when using \schemename. \schemename recombines foreground objects with different backgrounds each epoch, thus creating a more diverse training set. We still apply strong traditional data augmentation afterwards.} + \label{fig:fig-1} +\end{figure} + +Image classification, a fundamental task in computer vision (CV), involves assigning labels to images from a set of categories. +It underpins a wide range of applications, like medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2022b}, and object recognition~\cite{Carion2020,He2017,Girshick2013}, and facilitates large-scale pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b} and progress evaluation in CV~\cite{Khan2022, Rangel2024}. +The advent of large-scale datasets, particularly ImageNet~\cite{Deng2009}, served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016}, and ImageNet has remained the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}.
+While traditionally, convolutional neural networks (CNNs) have been the go-to architecture in CV, Transformers \cite{Vaswani2017}, particularly the Vision Transformer (ViT) \cite{Dosovitskiy2021}, have emerged as a powerful alternative, demonstrating +superior performance in various vision tasks, including image classification \cite{Wortsman2022,Yu2022,Carion2020,Zong2022,Wang2022a}. + + + +Data augmentation is a key technique for training image classification models. +Traditional augmentation methods, such as cropping, flipping, or color shifts, are commonly employed to increase data diversity~\cite{Xu2023d, Shorten2019}, but remain bound to existing image compositions. +While these preserve the images' semantic meaning, their ability to teach spatial invariances is limited. +While combinations of these data augmentations are still used today, they were originally proposed to benefit CNNs. +However, the architectural differences of CNNs and Transformers suggest that the latter might benefit from different data augmentation strategies. +In particular, the self-attention mechanism, unlike a CNN, is not translation equivariant~\cite{RojasGomez2023,Ding2023a}, meaning that the model has no built-in prior for the spatial relationships between pixels. + +Recognizing that Transformers need to learn spatial relationships directly from data, +we propose \schemename, a data augmentation method that makes these relationships explicit by recombining foreground objects with diverse backgrounds. +Thus, \schemename goes beyond existing image compositions and encodes desired invariances directly into the training data (see \Cref{fig:fig-1}). +Applying \schemename to a dataset like ImageNet is a two-step process: +(1)~We separate the foreground objects in ImageNet from their backgrounds using an open-world object detector~\cite{Ren2024}, and fill in the background in a neutral way using an object removal model~\cite{Sun2024,Suvorov2021}.
+(2)~This allows us to then recombine any foreground object with any background on the fly, creating a highly diverse training set. +By exploiting the control over foreground size and position during recombination, \schemename explicitly teaches spatial invariances that image classification models typically must learn implicitly. +We show that using \schemename in addition to strong traditional data augmentation increases the model accuracy of Transformers by up to $4.5$ p.p. on ImageNet and reduces the error rate by up to $7.3$ p.p. in downstream tasks. + +Beyond training, \schemename becomes a diagnostic tool for analyzing model behavior and biases when used during evaluation. +We utilize our control over the image distribution to measure a model's background robustness (by varying the choice of background), foreground focus (by leveraging our knowledge about the placement of the foreground object), center bias (by controlling position), and size bias (by controlling size). +These analyses provide valuable insights into model behavior and biases, which is crucial for model deployment and future robustness optimizations. +We show that training using \schemename significantly reduces all of these biases. +We make our code for \schemename and the output of \schemename's segmentation phase on ImageNet publicly available\footnote{Link will go here.} to facilitate further research. + +\subsection*{Contributions} +\begin{itemize} + \item We propose \schemename, a novel data augmentation scheme that recombines objects and backgrounds. \schemename allows us to move beyond the (possibly biased) image compositions in the dataset while preserving label integrity. + \item We show that training a standard ViT using \schemename leads to up to $4.5$ p.p. improved accuracy on ImageNet-1k and $7.3$ p.p. on downstream tasks.
+ \item We propose novel \schemename-based metrics to analyze and quantify fine-grained biases of trained models: Background Robustness, Foreground Focus, Center Bias, and Size Bias. We show that \schemename significantly reduces these biases by encoding invariance that benefits ViT into the training data. +\end{itemize} diff --git a/arxiv_v2_arXiv/sec/method.tex b/arxiv_v2_arXiv/sec/method.tex new file mode 100644 index 0000000..d800419 --- /dev/null +++ b/arxiv_v2_arXiv/sec/method.tex @@ -0,0 +1,71 @@ + + +\section{\schemename (Method)} +\label{sec:method} + + +We introduce \schemename, a data augmentation designed to enhance Transformer training by embedding spatial invariances---which Transformers would otherwise need to learn implicitly---directly into the training data. +\schemename comprises two distinct stages: Segmentation and Recombination. Both stages are illustrated in \Cref{fig:method}. + + +\subsection{Segmentation} +\label{sec:segmentation} +The segmentation stage isolates the foreground objects and their corresponding backgrounds. +We then fill the background using a pretrained object-removal model, producing visually plausible~\cite{Sun2024}, neutral scenes ready for recombination. +This stage is computed once offline and the results are stored for the recombination stage. + +First, foreground objects are detected and segmented from their backgrounds using a prompt-based segmentation model to exploit the classification dataset's labels. +We use the state-of-the-art Grounded SAM~\cite{Ren2024}, which is based on Grounding DINO~\cite{Liu2023e} and SAM~\cite{Kirillov2023}. +The prompt we use is ``\code{a <class>, a type of <category>}'', where \code{<class>} is the specific name of the object's class as defined by the dataset and \code{<category>} is the broader category of the object. +The \code{<category>} guides the segmentation model towards the correct object in case the \code{<class>} alone is too specific.
+This can be the case with prompts like ``sorrel'' or ``guenon'', where the more general name ``horse'' or ``monkey'' is more helpful. +We derive the \code{<category>} from the WordNet hierarchy, using the immediate hypernym. + +We iteratively extract $n$ foreground masks for each dataset-image, creating prompts by going one hypernym up the WordNet-tree each step (e.g.\ ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...). +Masks that are very similar, with a pairwise IoU of at least $0.9$, are merged. +The output is a set of masks delineating the foreground objects and the backgrounds. +We select the best mask per image (according to \Cref{eq:filtering-score}) in a later filtering step, described below. + +Next, an inpainting model that is specifically optimized to remove objects from images, such as LaMa~\cite{Suvorov2021} or Attentive Eraser~\cite{Sun2024}, is used to inpaint the foreground regions in the backgrounds. +Then, to ensure the quality of the foregrounds and the neutral background images, we select a foreground/background pair (for each dataset-image) from the $\leq n$ variants we have extracted and infilled in the previous steps. +Using an ensemble $E$ of six ViT, ResNet, and Swin Transformer models pretrained on the original dataset, we select the foreground/background pair that maximizes foreground performance while minimizing the performance on the background and the size of the foreground.
+For each model $m \in E$, we predict the score of the ground truth class $c$ on the foreground $\mathrm{fg}$ and background $\mathrm{bg}$ and weigh these with the size $\operatorname{size}(\cdot)$ in number of pixels according to: +\begin{align} \begin{split} \label{eq:filtering-score} + \text{score}(\mathrm{fg}, \mathrm{bg}, c) &= \log \left( \frac{1}{\abs{E}} \sum_{m \in E} \P[m(\mathrm{fg}) = c] \right) \\ + & + \log \left( 1 - \frac{1}{\abs{E}} \sum_{m \in E} \P[m(\mathrm{bg}) = c] \right) \\ + & + \lambda \log \left( 1 - \abs{\frac{\operatorname{size}(\mathrm{fg})}{\operatorname{size}(\mathrm{bg})} - \eps} \right). + \end{split} \end{align} +We run a hyperparameter search using a manually annotated subset of foreground/background variants to find the factors in \Cref{eq:filtering-score}: $\lambda = 2$ and $\eps = 0.1$. + +Finally, we filter out backgrounds that are largely infilled, as these tend to be overly synthetic and do not carry much information (see the supplementary material). +Although the segmentation stage introduces computational overhead, it is a one-time cost with results that can be reused across experiments (see the supplementary material for details). +In summary, we factorize the dataset into a set of foreground objects with a transparent background and a set of diverse backgrounds per class. +The next step is to recombine these, before applying other common data augmentation operations during training. + +\subsection{Recombination} +\label{sec:recombination} +The recombination stage, performed online during training, combines the foreground objects with different backgrounds to create new training samples. +For each object, we follow the pipeline of: pick an appropriate background, resize the foreground to a fitting size, and place it onto the background image. +Through this step, we expose the model to variations beyond the image compositions of the dataset.
+ +For each foreground object, we sample a background using one of the following strategies: +(1) the original image background, (2) the set of backgrounds from the same class, or (3) the set of all possible backgrounds. +These sets trade off the amount of information the model can learn from the background against the diversity of new images created. +In each epoch, each foreground object is seen exactly once, but a background may appear multiple times. + +The selected foreground is resized based on its relative size within its original image and the relative size of the original foreground in the selected background image. +The final size is randomly selected from a 30\% range around upper and lower limits ($s_u$ and $s_l$), based on the original sizes. +To balance the size of the foreground and that of the background's original foreground, the upper and lower limits $s_u$ and $s_l$ are set to the mean or range of both sizes, depending on the foreground size strategy: \emph{mean} or \emph{range}. + +The resized foreground is then placed at a random position within the background image. +To more seamlessly integrate the foreground, we apply a Gaussian blur with ${\sigma \in [\frac{\sigma_{\text{max}}}{10}, \sigma_{\text{max}}]}$, inspired by the standard range for the Gaussian blur operation in \cite{Touvron2022}, to the foreground's alpha-mask. + +We can apply standard data augmentation techniques in two modes: +Either we apply all augmentations to the recombined image, or we apply the cropping and resizing to the background only and then apply the other augmentations after recombination. +The first mode mirrors standard augmentation practice, whereas the second one ensures the foreground object remains fully visible. + +We experiment with a constant mixing ratio, or a linear or cosine annealing schedule that increases the number of images from the original dataset over time.
+The mixing ratio acts as a probability of selecting an image from the original dataset; +otherwise, an image with the same foreground is recombined using \schemename, ensuring each object is seen once per epoch. +The recombination stage is designed to be parallelized on the CPU during training and thus does not impact training time (see supplementary material for details). + diff --git a/arxiv_v2_arXiv/sec/related_work.tex b/arxiv_v2_arXiv/sec/related_work.tex new file mode 100644 index 0000000..0aacebe --- /dev/null +++ b/arxiv_v2_arXiv/sec/related_work.tex @@ -0,0 +1,40 @@ + +\section{Related Work} +\label{sec:related_work} + +\paragraph{Data Augmentation for Image Classification} +Data augmentation is a crucial technique for improving the performance and generalization of image classification models. +Traditional augmentation strategies rely on simple geometric or color-space transformations like cropping, flipping, rotation, blurring, color jittering, or random erasing \cite{Zhong2017} to increase the diversity of the training data without changing their semantic meaning. +With the advent of Vision Transformers, new data augmentation operations like PatchDropout \cite{Liu2022d} have been proposed. +Other transformations like Mixup \cite{Zhang2018a}, CutMix \cite{Yun2019}, or random cropping and patching \cite{Takahashi2018} combine multiple input images. +These simple transformations are usually bundled to form more complex augmentation policies like AutoAugment \cite{Cubuk2018} and RandAugment \cite{Cubuk2019}, +or 3-augment \cite{Touvron2022} which is optimized to train a ViT. +For a general overview of data augmentation techniques for image classification, we refer to \citet{Shorten2019, Xu2023d}. + +We build upon these general augmentations by introducing a novel approach to explicitly separate objects and backgrounds for image classification, allowing us to -- unlike these basic transformations -- move beyond dataset image compositions.
+Our approach is used in addition to strong traditional techniques to improve performance and reduce biases. + +\paragraph{Copy-Paste Augmentation} +The copy-paste augmentation \cite{Ghiasi2020}, which is used only for object detection \cite{Shermaine2025,Ghiasi2020} and instance segmentation \cite{Werman2021,Ling2022}, involves copying segmented objects from one image and pasting them onto another. +While typically human-annotated segmentation masks are used to extract the foreground objects, other foreground sources have been explored, like 3D models \cite{Hinterstoisser2019} and pretrained object-detection models for use on objects on white background \cite{Dwibedi2017} or synthetic images \cite{Ge2023}. +\citet{Kang2022} apply copy-paste as an alternative to CutMix in image classification, but they do not shift the size or position of the foregrounds and use normal dataset images as backgrounds. + +Unlike prior copy-paste methods that overlay objects, \schemename extracts foregrounds and replaces their backgrounds with semantically neutral fills, thereby preserving label integrity while enabling controlled and diverse recombination. + +\begin{figure*}[ht!] + \centering + \includegraphics[width=.9\textwidth]{img/fig-2.pdf} + \caption{Overview of \schemename. The data creation consists of two stages: Segmentation (offline, \Cref{sec:segmentation}), where we segment the foreground objects from the background and fill in the background. Recombination (online, \Cref{sec:recombination}), where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.} + \label{fig:method} +\end{figure*} + +\paragraph{Model robustness evaluation} +Evaluating model robustness to various image variations is critical for understanding and improving model generalization.
+Datasets like ImageNet-C \cite{Hendrycks2019} and ImageNet-P \cite{Hendrycks2019} introduce common corruptions and perturbations. +ImageNet-E \cite{Li2023e} evaluates model robustness against a collection of distribution shifts. +Other datasets, such as ImageNet-D \cite{Zhang2024f}, focus on varying background, texture, and material, but rely on synthetic data. +Stylized ImageNet \cite{Geirhos2018} investigates the impact of texture changes. +ImageNet-9 \cite{Xiao2020} explores background variations using segmented images, but backgrounds are often artificial. + +In contrast to these existing datasets, which are used only for evaluation, \schemename provides fine-grained control over foreground object placement, size, and background selection, enabling a precise and comprehensive analysis of specific model biases within the context of a large-scale, real-world image distribution. +As \schemename also provides controllable training set generation, it goes beyond simply measuring robustness to actively improving it through training. diff --git a/cvpr.sty b/cvpr.sty new file mode 100644 index 0000000..6ce0bd8 --- /dev/null +++ b/cvpr.sty @@ -0,0 +1,508 @@ +% --------------------------------------------------------------- +% +% No guarantee is given that the format corresponds perfectly to +% IEEE 8.5" x 11" Proceedings, but most features should be ok. +% +% --------------------------------------------------------------- +% with LaTeX2e: +% ============= +% +% use as +% \documentclass[times,10pt,twocolumn]{article} +% \usepackage[options]{cvpr} +% \usepackage{times} +% +% "options" should be replaced by +% * "review" for submitting a paper for review, +% * "final" for the camera ready, and +% * "rebuttal" for the author rebuttal. 
+% +% specify references as +% {\small +% \bibliographystyle{ieee} +% \bibliography{...your files...} +% } +% --------------------------------------------------------------- + +\NeedsTeXFormat{LaTeX2e}[1999/12/01] +\ProvidesPackage{cvpr}[2026 LaTeX class for IEEE CVPR] + +\RequirePackage{times} % Integrate Times for here +\RequirePackage{xspace} +\RequirePackage[dvipsnames]{xcolor} +\RequirePackage{graphicx} +\RequirePackage{amsmath} +\RequirePackage{amssymb} +\RequirePackage{booktabs} +\RequirePackage[numbers,sort&compress]{natbib} +\setlength{\bibsep}{1pt plus 1pt minus 1pt} + +\RequirePackage{silence} % Suppress unwanted warnings +\hbadness=10000 \vbadness=10000 \vfuzz=30pt \hfuzz=30pt +\WarningFilter{latexfont}{Font shape declaration} +\WarningFilter{latex}{Font shape} +\WarningFilter[rebuttal]{latex}{No \author given} +\RequirePackage{etoolbox} + +% Use modern caption package to allow for sub-figures etc. +% Reproduces the original CVPR/ICCV style as closely as possible. +\RequirePackage[format=plain,labelformat=simple,labelsep=period,font=small,compatibility=false]{caption} +\RequirePackage[font=footnotesize,skip=3pt,subrefformat=parens]{subcaption} + + +\newtoggle{cvprfinal} % Camera-ready version +\newtoggle{cvprrebuttal} % Rebuttal +\newtoggle{cvprpagenumbers} % Force page numbers (in camera ready) +\toggletrue{cvprfinal} +\togglefalse{cvprrebuttal} +\togglefalse{cvprpagenumbers} +\DeclareOption{review}{\togglefalse{cvprfinal}\toggletrue{cvprpagenumbers}} +\DeclareOption{rebuttal}{\togglefalse{cvprfinal}\toggletrue{cvprrebuttal}} +\DeclareOption{pagenumbers}{\toggletrue{cvprpagenumbers}} +\DeclareOption*{\PackageWarning{cvpr}{Unkown option `\CurrentOption'}} +\ProcessOptions\relax + +% Don't warn about missing author for rebuttal +\iftoggle{cvprrebuttal}{% + \ActivateWarningFilters[rebuttal] +}{} + +% Breaking lines for URLs in the bib +\RequirePackage[hyphens]{url} +\Urlmuskip=0mu plus 1mu\relax + + +% 
--------------------------------------------------------------- +% Inlined version of the obsolete "everyshi-2001-05-15" package. +\newcommand{\@EveryShipout@Hook}{} +\newcommand{\@EveryShipout@AtNextHook}{} +\newcommand*{\EveryShipout}[1] + {\g@addto@macro\@EveryShipout@Hook{#1}} +\newcommand*{\AtNextShipout}[1] + {\g@addto@macro\@EveryShipout@AtNextHook{#1}} +\newcommand{\@EveryShipout@Shipout}{% + \afterassignment\@EveryShipout@Test + \global\setbox\@cclv= % + } +\newcommand{\@EveryShipout@Test}{% + \ifvoid\@cclv\relax + \aftergroup\@EveryShipout@Output + \else + \@EveryShipout@Output + \fi% + } +\newcommand{\@EveryShipout@Output}{% + \@EveryShipout@Hook% + \@EveryShipout@AtNextHook% + \gdef\@EveryShipout@AtNextHook{}% + \@EveryShipout@Org@Shipout\box\@cclv% + } +\newcommand{\@EveryShipout@Org@Shipout}{} +\newcommand*{\@EveryShipout@Init}{% + \message{ABD: EveryShipout initializing macros}% + \let\@EveryShipout@Org@Shipout\shipout + \let\shipout\@EveryShipout@Shipout + } +\AtBeginDocument{\@EveryShipout@Init} +% --------------------------------------------------------------- + + +% --------------------------------------------------------------- +% Inlined simplified version of the "eso-pic" package. 
+\newcommand\LenToUnit[1]{#1\@gobble} +\newcommand\AtPageUpperLeft[1]{% + \begingroup + \@tempdima=0pt\relax\@tempdimb=\ESO@yoffsetI\relax + \put(\LenToUnit{\@tempdima},\LenToUnit{\@tempdimb}){#1}% + \endgroup +} +\newcommand\AtPageLowerLeft[1]{\AtPageUpperLeft{% + \put(0,\LenToUnit{-\paperheight}){#1}}} +\newcommand\AtPageCenter[1]{\AtPageUpperLeft{% + \put(\LenToUnit{.5\paperwidth},\LenToUnit{-.5\paperheight}){#1}}% +} +\newcommand\AtTextUpperLeft[1]{% + \begingroup + \setlength\@tempdima{1in}% + \ifodd\c@page% + \advance\@tempdima\oddsidemargin% + \else% + \advance\@tempdima\evensidemargin% + \fi% + \@tempdimb=\ESO@yoffsetI\relax\advance\@tempdimb-1in\relax% + \advance\@tempdimb-\topmargin% + \advance\@tempdimb-\headheight\advance\@tempdimb-\headsep% + \put(\LenToUnit{\@tempdima},\LenToUnit{\@tempdimb}){#1}% + \endgroup +} +\newcommand\AtTextLowerLeft[1]{\AtTextUpperLeft{% + \put(0,\LenToUnit{-\textheight}){#1}}} +\newcommand\AtTextCenter[1]{\AtTextUpperLeft{% + \put(\LenToUnit{.5\textwidth},\LenToUnit{-.5\textheight}){#1}}} +\newcommand{\ESO@HookI}{} \newcommand{\ESO@HookII}{} +\newcommand{\ESO@HookIII}{} +\newcommand{\AddToShipoutPicture}{% + \@ifstar{\g@addto@macro\ESO@HookII}{\g@addto@macro\ESO@HookI}} +\newcommand{\ClearShipoutPicture}{\global\let\ESO@HookI\@empty} +\newcommand\ESO@isMEMOIR[1]{} +\@ifclassloaded{memoir}{\renewcommand\ESO@isMEMOIR[1]{#1}}{} +\newcommand{\@ShipoutPicture}{% + \bgroup + \@tempswafalse% + \ifx\ESO@HookI\@empty\else\@tempswatrue\fi% + \ifx\ESO@HookII\@empty\else\@tempswatrue\fi% + \ifx\ESO@HookIII\@empty\else\@tempswatrue\fi% + \if@tempswa% + \@tempdima=1in\@tempdimb=-\@tempdima% + \advance\@tempdimb\ESO@yoffsetI% + \ESO@isMEMOIR{% + \advance\@tempdima\trimedge% + \advance\@tempdima\paperwidth% + \advance\@tempdima-\stockwidth% + \if@twoside\ifodd\c@page\else% + \advance\@tempdima-2\trimedge% + \advance\@tempdima-\paperwidth% + \advance\@tempdima\stockwidth% + \fi\fi% + \advance\@tempdimb\trimtop}% + \unitlength=1pt% + 
\global\setbox\@cclv\vbox{% + \vbox{\let\protect\relax + \pictur@(0,0)(\strip@pt\@tempdima,\strip@pt\@tempdimb)% + \ESO@HookIII\ESO@HookI\ESO@HookII% + \global\let\ESO@HookII\@empty% + \endpicture}% + \nointerlineskip% + \box\@cclv}% + \fi + \egroup +} +\EveryShipout{\@ShipoutPicture} +\RequirePackage{keyval} +\newif\ifESO@dvips\ESO@dvipsfalse +\newif\ifESO@texcoord\ESO@texcoordfalse + +\AtBeginDocument{% + \IfFileExists{color.sty} + {% + \RequirePackage{color} + \let\ESO@color=\color\let\ESO@colorbox=\colorbox + \let\ESO@fcolorbox=\fcolorbox + }{} + \@ifundefined{Gin@driver}{}% + {% + \ifx\Gin@driver\@empty\else% + \filename@parse{\Gin@driver}\def\reserved@a{dvips}% + \ifx\filename@base\reserved@a\ESO@dvipstrue\fi% + \fi + }% + \ifx\pdfoutput\undefined\else + \ifx\pdfoutput\relax\else + \ifcase\pdfoutput\else + \ESO@dvipsfalse% + \fi + \fi + \fi +} +\ifESO@texcoord + \def\ESO@yoffsetI{0pt}\def\ESO@yoffsetII{-\paperheight} +\else + \def\ESO@yoffsetI{\paperheight}\def\ESO@yoffsetII{0pt} +\fi +% --------------------------------------------------------------- + + +\typeout{CVPR 8.5 x 11-Inch Proceedings Style `cvpr.sty'.} + +% ten point helvetica bold required for captions +% eleven point times bold required for second-order headings +% in some sites the name of the fonts may differ, +% change the name here: +\font\cvprtenhv = phvb at 8pt % *** IF THIS FAILS, SEE cvpr.sty *** +\font\elvbf = ptmb scaled 1100 +\font\tenbf = ptmb scaled 1000 + +% If the above lines give an error message, try to comment them and +% uncomment these: +%\font\cvprtenhv = phvb7t at 8pt +%\font\elvbf = ptmb7t scaled 1100 +%\font\tenbf = ptmb7t scaled 1000 + +% set dimensions of columns, gap between columns, and paragraph indent +\setlength{\textheight}{8.875in} +\setlength{\textwidth}{6.875in} +\setlength{\columnsep}{0.3125in} +\setlength{\topmargin}{0in} +\setlength{\headheight}{0in} +\setlength{\headsep}{0in} +\setlength{\parindent}{1pc} +\setlength{\oddsidemargin}{-0.1875in} 
+\setlength{\evensidemargin}{-0.1875in} + + +% Suppress page numbers when the appropriate option is given +\iftoggle{cvprpagenumbers}{}{% + \pagestyle{empty} +} + +\AtBeginDocument{% + % Print an error if document class other than article is used + \@ifclassloaded{article}{}{% + \PackageError{cvpr}{Package only meant to be used with document class `article'}{Change document class to `article'.} + } + % Print a warning if incorrect options for article are specified + \@ifclasswith{article}{10pt}{}{% + \PackageWarningNoLine{cvpr}{Incorrect font size specified - CVPR requires 10-point fonts. Please load document class `article' with `10pt' option} + } + \@ifclasswith{article}{twocolumn}{}{% + \PackageWarningNoLine{cvpr}{Single column document - CVPR requires papers to have two-column layout. Please load document class `article' with `twocolumn' option} + } + \@ifclasswith{article}{letterpaper}{}{% + \PackageWarningNoLine{cvpr}{Incorrect paper size - CVPR uses paper size `letter'. Please load document class `article' with `letterpaper' option} + } + % Print a warning if hyperref is not loaded and/or if the pagebackref option is missing + \iftoggle{cvprfinal}{% + \@ifpackageloaded{hyperref}{}{% + \PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded, but highly recommended for camera-ready version} + } + }{% + \@ifpackageloaded{hyperref}{ + \@ifpackagewith{hyperref}{pagebackref}{}{ + \PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded with option `pagebackref', which is strongly recommended for review version} + } + }{% + \PackageWarningNoLine{cvpr}{Package `hyperref' is not loaded, but strongly recommended for review version} + } + } +} + +\def\@maketitle{ + \newpage + \null + \iftoggle{cvprrebuttal}{\vspace*{-.3in}}{\vskip .375in} + \begin{center} + % smaller title font only for rebuttal + \iftoggle{cvprrebuttal}{{\large \bf \@title \par}}{{\Large \bf \@title \par}} + % additional two empty lines at the end of the title + 
\iftoggle{cvprrebuttal}{\vspace*{-22pt}}{\vspace*{24pt}}{ + \large + \lineskip .5em + \begin{tabular}[t]{c} + \iftoggle{cvprfinal}{ + \@author + }{ + \iftoggle{cvprrebuttal}{}{ + Anonymous \confName~submission\\ + \vspace*{1pt}\\ + Paper ID \paperID + } + } + \end{tabular} + \par + } + % additional small space at the end of the author name + \vskip .5em + % additional empty line at the end of the title block + \vspace*{12pt} + \end{center} +} + +\def\abstract{% + % Suppress page numbers when the appropriate option is given + \iftoggle{cvprpagenumbers}{}{% + \thispagestyle{empty} + } + \centerline{\large\bf Abstract}% + \vspace*{12pt}\noindent% + \it\ignorespaces% +} + +\def\endabstract{% + % additional empty line at the end of the abstract + \vspace*{12pt} + } + +\def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} + +% correct heading spacing and type +\def\cvprsection{\@startsection {section}{1}{\z@} + {-10pt plus -2pt minus -2pt}{7pt} {\large\bf}} +\def\cvprssect#1{\cvprsection*{#1}} +\def\cvprsect#1{\cvprsection{\texorpdfstring{\hskip -1em.~}{}#1}} +\def\section{\@ifstar\cvprssect\cvprsect} + +\def\cvprsubsection{\@startsection {subsection}{2}{\z@} + {-8pt plus -2pt minus -2pt}{5pt} {\elvbf}} +\def\cvprssubsect#1{\cvprsubsection*{#1}} +\def\cvprsubsect#1{\cvprsubsection{\texorpdfstring{\hskip -1em.~}{}#1}} +\def\subsection{\@ifstar\cvprssubsect\cvprsubsect} + +\def\cvprsubsubsection{\@startsection {subsubsection}{3}{\z@} + {-6pt plus -2pt minus -2pt}{3pt} {\tenbf}} +\def\cvprssubsubsect#1{\cvprsubsubsection*{#1}} +\def\cvprsubsubsect#1{\cvprsubsubsection{\texorpdfstring{\hskip -1em.~}{}#1}} +\def\subsubsection{\@ifstar\cvprssubsubsect\cvprsubsubsect} + +%% --------- Page background marks: Ruler and confidentiality (only for review and rebuttal) +\iftoggle{cvprfinal}{ + % In review and rebuttal mode, we use the "lineno" package for numbering lines. 
+ % When switching to a different mode, the "\@LN" macro may remain in cached .aux files, + % leading to build errors (https://github.com/cvpr-org/author-kit/issues/49). + % Defining the macro as empty fixes that (https://tex.stackexchange.com/a/125779). + \makeatletter + \providecommand{\@LN}[2]{} + \makeatother +}{ + % ----- define vruler + \makeatletter + \newbox\cvprrulerbox + \newcount\cvprrulercount + \newdimen\cvprruleroffset + \newdimen\cv@lineheight + \newdimen\cv@boxheight + \newbox\cv@tmpbox + \newcount\cv@refno + \newcount\cv@tot + % NUMBER with left flushed zeros \fillzeros[] + \newcount\cv@tmpc@ \newcount\cv@tmpc + \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi + \cv@tmpc=1 % + \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi + \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat + \ifnum#2<0\advance\cv@tmpc1\relax-\fi + \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat + \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% + \makeatother + % ----- end of vruler + + %% Define linenumber setup + \RequirePackage[switch,mathlines]{lineno} + + % Line numbers in CVPR blue using font from \cvprtenhv + \renewcommand\linenumberfont{\cvprtenhv\color[rgb]{.5,.5,1}} + + \renewcommand\thelinenumber{\fillzeros[3]{\arabic{linenumber}}} + + \setlength{\linenumbersep}{.75cm} + + % Bug: An equation with $$ ... $$ isn't numbered, nor is the previous line. + + % Patch amsmath commands so that the previous line and the equation itself + % are numbered. Bug: multiline has an extra line number. 
+ % https://tex.stackexchange.com/questions/461186/how-to-use-lineno-with-amsmath-align + \RequirePackage{etoolbox} %% <- for \pretocmd, \apptocmd and \patchcmd + + \newcommand*\linenomathpatch[1]{% + \expandafter\pretocmd\csname #1\endcsname {\linenomath}{}{}% + \expandafter\pretocmd\csname #1*\endcsname {\linenomath}{}{}% + \expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}% + \expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}% + } + \newcommand*\linenomathpatchAMS[1]{% + \expandafter\pretocmd\csname #1\endcsname {\linenomathAMS}{}{}% + \expandafter\pretocmd\csname #1*\endcsname {\linenomathAMS}{}{}% + \expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}% + \expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}% + } + + %% Definition of \linenomathAMS depends on whether the mathlines option is provided + \expandafter\ifx\linenomath\linenomathWithnumbers + \let\linenomathAMS\linenomathWithnumbers + %% The following line gets rid of an extra line numbers at the bottom: + \patchcmd\linenomathAMS{\advance\postdisplaypenalty\linenopenalty}{}{}{} + \else + \let\linenomathAMS\linenomathNonumbers + \fi + + % Add the numbers + \linenumbers + \AtBeginDocument{% + \linenomathpatch{equation}% + \linenomathpatchAMS{gather}% + \linenomathpatchAMS{multline}% + \linenomathpatchAMS{align}% + \linenomathpatchAMS{alignat}% + \linenomathpatchAMS{flalign}% + } + + % \makevruler[][][][][] + \def\cvprruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\cvprrulerbox}} + \AddToShipoutPicture{% + \color[rgb]{.5,.5,1} + + \def\pid{\parbox{1in}{\begin{center}\bf\sf{\small \confName}\\\small \#\paperID\end{center}}} + \AtTextUpperLeft{%paperID in corners + \put(\LenToUnit{-65pt},\LenToUnit{45pt}){\pid} + \put(\LenToUnit{\textwidth-12pt},\LenToUnit{45pt}){\pid} + } + \AtTextUpperLeft{%confidential + \put(0,\LenToUnit{1cm}){\parbox{\textwidth}{\centering\cvprtenhv + \confName~\confYear~Submission \#\paperID. 
CONFIDENTIAL REVIEW COPY. DO NOT DISTRIBUTE.}} + } + } +} % end of not cvprfinal + +%%% Make figure placement a little more predictable. +% We trust the user to move figures if this results +% in ugliness. +% Minimize bad page breaks at figures +\renewcommand{\textfraction}{0.01} +\renewcommand{\floatpagefraction}{0.99} +\renewcommand{\topfraction}{0.99} +\renewcommand{\bottomfraction}{0.99} +\renewcommand{\dblfloatpagefraction}{0.99} +\renewcommand{\dbltopfraction}{0.99} +\setcounter{totalnumber}{99} +\setcounter{topnumber}{99} +\setcounter{bottomnumber}{99} + +% Add a period to the end of an abbreviation unless there's one +% already, then \xspace. +\makeatletter +\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot} +\def\@onedot{\ifx\@let@token.\else.\null\fi\xspace} + +\def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot} +\def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot} +\def\cf{\emph{cf}\onedot} \def\Cf{\emph{Cf}\onedot} +\def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot} +\def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot} +\def\iid{i.i.d\onedot} \def\wolog{w.l.o.g\onedot} +\def\etal{\emph{et al}\onedot} +\makeatother + +% --------------------------------------------------------------- + +%% redefine the \title command so that a variable name is saved in \thetitle, and provides the \maketitlesupplementary command +\let\titleold\title +\renewcommand{\title}[1]{\titleold{#1}\newcommand{\thetitle}{#1}} +\def\maketitlesupplementary + { + \newpage + \twocolumn[ + \centering + \Large + \textbf{\thetitle}\\ + \vspace{0.5em}Supplementary Material \\ + \vspace{1.0em} + ] %< twocolumn + } + +% --------------------------------------------------------------- + +%% Support for easy cross-referencing (e.g. 
\cref{sec:intro} +% configured with \AtEndPreamble as it needs to be called after hyperref +\AtEndPreamble{ + \usepackage[capitalize]{cleveref} + \crefname{section}{Sec.}{Secs.} + \Crefname{section}{Section}{Sections} + \Crefname{table}{Table}{Tables} + \crefname{table}{Tab.}{Tabs.} +} + +% --------------------------------------------------------------- + +%% More compact compact itemize/enumeration (e.g. list contributions) +\RequirePackage[shortlabels,inline]{enumitem} +\setlist[itemize]{noitemsep,leftmargin=*,topsep=0em} +\setlist[enumerate]{noitemsep,leftmargin=*,topsep=0em} diff --git a/img/DeiT-B_ImageNet_v2.pdf b/img/DeiT-B_ImageNet_v2.pdf new file mode 100644 index 0000000..ac76a2f Binary files /dev/null and b/img/DeiT-B_ImageNet_v2.pdf differ diff --git a/img/DeiT-B_ImageNet_v3.pdf b/img/DeiT-B_ImageNet_v3.pdf new file mode 100644 index 0000000..da18456 --- /dev/null +++ b/img/DeiT-B_ImageNet_v3.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMA >Oc%4GN2 b"rR=>}{<~:'0> 72 +pHf +peŗ#~HKD)"ִ -a*Hha/gD_0=UOKձz hbQ8Sn¥YUN9+K"7P#yUIa~[;@C27R-̫paiFpfR*QN9f YϒQpS>lfngH5jEZ-ZRG;J)'vζIfd367\Zem +{5IXl(a\o~ǎ3Qʹ9bwqE#VN9m؁،<__5'58G +endstream +endobj +12 0 obj +420 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073347+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000970 
00000 n +0000000865 00000 n +0000000886 00000 n +0000000907 00000 n +0000000928 00000 n +0000000949 00000 n +0000000065 00000 n +0000000330 00000 n +0000000845 00000 n +0000000208 00000 n +0000000825 00000 n +0000001030 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1187 +%%EOF diff --git a/img/DeiT-B_ImageNet_vNone.pdf b/img/DeiT-B_ImageNet_vNone.pdf new file mode 100644 index 0000000..28816a9 Binary files /dev/null and b/img/DeiT-B_ImageNet_vNone.pdf differ diff --git a/img/DeiT-B_fornet_all_cos_0.8_2.0_v3.pdf b/img/DeiT-B_fornet_all_cos_0.8_2.0_v3.pdf new file mode 100644 index 0000000..5154597 --- /dev/null +++ b/img/DeiT-B_fornet_all_cos_0.8_2.0_v3.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn@ <OC9&nMH +$//B<{|Cc?|H{d<!/m X l>|?/|EڅY,iᚔi"X0sp҂Zԛp夘(qe/vf2IF؁ 5DXNŅSĕ.f^ˋiR͉e+a˝hbjtKlE&Wz 񨙲s !\ +{˺ѰXJRwKx_7EIo% !ls' ;mL[}5BkUͮmj[.9RJ({˺m$kנa-Q+L= u%l/ov$Qgȥ_5{9WQr2_m +endstream +endobj +12 0 obj +432 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150053+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000982 00000 n +0000000877 00000 n +0000000898 00000 n +0000000919 00000 n +0000000940 00000 n +0000000961 00000 n +0000000065 00000 n +0000000330 00000 n +0000000857 00000 n +0000000208 00000 n +0000000837 00000 n +0000001042 00000 n +trailer 
+<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1201 +%%EOF diff --git a/img/DeiT-B_fornet_all_cos_0.8_2.0_vNone.pdf b/img/DeiT-B_fornet_all_cos_0.8_2.0_vNone.pdf new file mode 100644 index 0000000..fc2eb70 Binary files /dev/null and b/img/DeiT-B_fornet_all_cos_0.8_2.0_vNone.pdf differ diff --git a/img/DeiT-B_fornet_all_cos_v1.pdf b/img/DeiT-B_fornet_all_cos_v1.pdf new file mode 100644 index 0000000..0abb5b6 Binary files /dev/null and b/img/DeiT-B_fornet_all_cos_v1.pdf differ diff --git a/img/DeiT-B_fornet_all_cos_v2.pdf b/img/DeiT-B_fornet_all_cos_v2.pdf new file mode 100644 index 0000000..06cec15 --- /dev/null +++ b/img/DeiT-B_fornet_all_cos_v2.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xn0 E +~CJ|nn3q@di^YZ^ç/+<0 @H sx;~b͇{/HKZ2"-ƚliHkdp|="wp%MID4J %E!F +Q#-jL UrC,s+9 CRZ!V](ecQr^ENTH;w"HKM j&n[w˨iۣ$\6V +.ځ}<_/3'n{Qpx^Uªk2µp@sgIN]%;i[gm4{ ROlzXi5~[ 7ʁ]/nؓil8:6Y,~C׺]-?jN>18g #9 +endstream +endobj +12 0 obj +429 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073409+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000979 00000 n +0000000874 00000 n +0000000895 00000 n +0000000916 00000 n +0000000937 00000 n +0000000958 00000 n +0000000065 00000 n +0000000330 00000 n +0000000854 00000 n +0000000208 00000 n +0000000834 00000 n +0000001039 00000 n +trailer +<< /Size 
14 /Root 1 0 R /Info 13 0 R >> +startxref +1196 +%%EOF diff --git a/img/DeiT-B_fornet_all_cos_v3.pdf b/img/DeiT-B_fornet_all_cos_v3.pdf new file mode 100644 index 0000000..ddf3280 Binary files /dev/null and b/img/DeiT-B_fornet_all_cos_v3.pdf differ diff --git a/img/DeiT-B_fornet_all_linear_v1.pdf b/img/DeiT-B_fornet_all_linear_v1.pdf new file mode 100644 index 0000000..c524b6f Binary files /dev/null and b/img/DeiT-B_fornet_all_linear_v1.pdf differ diff --git a/img/DeiT-L_ImageNet_v1.pdf b/img/DeiT-L_ImageNet_v1.pdf new file mode 100644 index 0000000..8785a89 Binary files /dev/null and b/img/DeiT-L_ImageNet_v1.pdf differ diff --git a/img/DeiT-L_ImageNet_v2.pdf b/img/DeiT-L_ImageNet_v2.pdf new file mode 100644 index 0000000..d20839e Binary files /dev/null and b/img/DeiT-L_ImageNet_v2.pdf differ diff --git a/img/DeiT-L_ImageNet_v3.pdf b/img/DeiT-L_ImageNet_v3.pdf new file mode 100644 index 0000000..8a83e3f --- /dev/null +++ b/img/DeiT-L_ImageNet_v3.pdf @@ -0,0 +1,73 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xn@ E +~Br8|l^];mh +_C-Eq5ç/+<30>!3A2>! 
+/}m \'6iI,U)+RUH -irG_4% n h L)ܼhambF*a:$m 7Pggw6ޭP2sҢ"b> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073347+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000991 00000 n +0000000886 00000 n +0000000907 00000 n +0000000928 00000 n +0000000949 00000 n +0000000970 00000 n +0000000065 00000 n +0000000330 00000 n +0000000866 00000 n +0000000208 00000 n +0000000846 00000 n +0000001051 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1208 +%%EOF diff --git a/img/DeiT-L_ImageNet_vNone.pdf b/img/DeiT-L_ImageNet_vNone.pdf new file mode 100644 index 0000000..6cdb23c Binary files /dev/null and b/img/DeiT-L_ImageNet_vNone.pdf differ diff --git a/img/DeiT-L_fornet_all_cos_v1.pdf b/img/DeiT-L_fornet_all_cos_v1.pdf new file mode 100644 index 0000000..26efb4b Binary files /dev/null and b/img/DeiT-L_fornet_all_cos_v1.pdf differ diff --git a/img/DeiT-L_fornet_all_cos_v2.pdf b/img/DeiT-L_fornet_all_cos_v2.pdf new file mode 100644 index 0000000..52779b7 Binary files /dev/null and b/img/DeiT-L_fornet_all_cos_v2.pdf differ diff --git a/img/DeiT-L_fornet_all_cos_v3.pdf b/img/DeiT-L_fornet_all_cos_v3.pdf new file mode 100644 index 0000000..3bbace3 Binary files /dev/null and b/img/DeiT-L_fornet_all_cos_v3.pdf differ diff --git a/img/DeiT-S_ImageNet_v2.pdf b/img/DeiT-S_ImageNet_v2.pdf new file mode 100644 index 0000000..d1e7762 Binary files /dev/null and b/img/DeiT-S_ImageNet_v2.pdf differ diff --git a/img/DeiT-S_ImageNet_v3.pdf b/img/DeiT-S_ImageNet_v3.pdf new file mode 100644 index 0000000..aee0d98 Binary files /dev/null and b/img/DeiT-S_ImageNet_v3.pdf differ diff --git a/img/DeiT-S_ImageNet_vNone.pdf b/img/DeiT-S_ImageNet_vNone.pdf new file mode 100644 index 0000000..ea24781 Binary 
files /dev/null and b/img/DeiT-S_ImageNet_vNone.pdf differ diff --git a/img/DeiT-S_fornet_all_cos_v1.pdf b/img/DeiT-S_fornet_all_cos_v1.pdf new file mode 100644 index 0000000..7927f34 Binary files /dev/null and b/img/DeiT-S_fornet_all_cos_v1.pdf differ diff --git a/img/DeiT-S_fornet_all_cos_v2.pdf b/img/DeiT-S_fornet_all_cos_v2.pdf new file mode 100644 index 0000000..40a0e98 Binary files /dev/null and b/img/DeiT-S_fornet_all_cos_v2.pdf differ diff --git a/img/DeiT-S_fornet_all_cos_v3.pdf b/img/DeiT-S_fornet_all_cos_v3.pdf new file mode 100644 index 0000000..f5ce2d3 Binary files /dev/null and b/img/DeiT-S_fornet_all_cos_v3.pdf differ diff --git a/img/DeiT-S_fornet_all_cos_vNone.pdf b/img/DeiT-S_fornet_all_cos_vNone.pdf new file mode 100644 index 0000000..42927e9 Binary files /dev/null and b/img/DeiT-S_fornet_all_cos_vNone.pdf differ diff --git a/img/DeiT-S_fornet_all_linear_v1.pdf b/img/DeiT-S_fornet_all_linear_v1.pdf new file mode 100644 index 0000000..17d723d --- /dev/null +++ b/img/DeiT-S_fornet_all_linear_v1.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xAn1 E<O0!%J$M.rvx$$8i7] 8}y˧ǟG$|BȸC#"HxڮШM6_WR,"-{EZrJe|="qQC8rR/fᄕ2;3RRWW eG ᩕ@NPg{ʦErn8fk Nd`VF١ob +zJ8&fIJ .\X2fnϐ^ϣFm#5ɜeI\դRqz&ΟxeN+!-E\HRrg=;J(vtxtNkZ5cM5RxL6:h_سil.9IrRcB.Eޞ6^/m͜Lsp/K +endstream +endobj +12 0 obj +416 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) 
+/CreationDate (D:20250802073412+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000966 00000 n +0000000861 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000065 00000 n +0000000330 00000 n +0000000841 00000 n +0000000208 00000 n +0000000821 00000 n +0000001026 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1183 +%%EOF diff --git a/img/DeiT-S_fornet_all_linear_v2.pdf b/img/DeiT-S_fornet_all_linear_v2.pdf new file mode 100644 index 0000000..f0b227f --- /dev/null +++ b/img/DeiT-S_fornet_all_linear_v2.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xM0 <O!%"m.zAv)0sB'ARta?h?}ٿz}iz/xD$@2pASvF pd+UKRR"-9e|;"qQM8ŒzaIRqGZTQmGBv +Z ;Tr{B캅fN58W +endstream +endobj +12 0 obj +433 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073412+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000983 00000 n +0000000878 00000 n +0000000899 00000 n +0000000920 00000 n +0000000941 00000 n +0000000962 00000 n +0000000065 00000 n +0000000330 00000 n +0000000858 00000 n +0000000208 00000 n +0000000838 00000 n +0000001043 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1200 +%%EOF diff --git a/img/DeiT-S_fornet_all_linear_v3.pdf b/img/DeiT-S_fornet_all_linear_v3.pdf new file mode 100644 
index 0000000..be979ae --- /dev/null +++ b/img/DeiT-S_fornet_all_linear_v3.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xnA E +.ҬMQ:AG&Eq^>~?}9~_ru  2 7`U$|k;~b'/Ht+IHj.\5R\#rgߘ4) nC+Q" 7VT s!(J)rI`,UYMآEIeo+aзbtsEj5U%iQN9]GMݞK؍A*,jV\k,]gn{Z]#u&ꄪ=(av26fήlR S(gVN9ڹ]}vz%)ǚ{j+ۤVN9u؁=Xҙ)fַ8QKrK/?e3 +endstream +endobj +12 0 obj +437 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.9.2, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.9.2) +/CreationDate (D:20250802073413+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000987 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000966 00000 n +0000000065 00000 n +0000000330 00000 n +0000000862 00000 n +0000000208 00000 n +0000000842 00000 n +0000001047 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1204 +%%EOF diff --git a/img/DeiT-S_fornet_all_linear_vNone.pdf b/img/DeiT-S_fornet_all_linear_vNone.pdf new file mode 100644 index 0000000..dad51e4 Binary files /dev/null and b/img/DeiT-S_fornet_all_linear_vNone.pdf differ diff --git a/img/ForAug-fig-1.pdf b/img/ForAug-fig-1.pdf new file mode 100644 index 0000000..33f9c75 Binary files /dev/null and b/img/ForAug-fig-1.pdf differ diff --git a/img/ResNet101_ImageNet_v1.pdf b/img/ResNet101_ImageNet_v1.pdf index 9a89e44..0000c01 100644 Binary files a/img/ResNet101_ImageNet_v1.pdf and 
b/img/ResNet101_ImageNet_v1.pdf differ diff --git a/img/ResNet101_ImageNet_v2.pdf b/img/ResNet101_ImageNet_v2.pdf index 10e02a8..06c7992 100644 Binary files a/img/ResNet101_ImageNet_v2.pdf and b/img/ResNet101_ImageNet_v2.pdf differ diff --git a/img/ResNet101_ImageNet_v3.pdf b/img/ResNet101_ImageNet_v3.pdf index f23929d..82752dd 100644 --- a/img/ResNet101_ImageNet_v3.pdf +++ b/img/ResNet101_ImageNet_v3.pdf @@ -42,9 +42,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094736+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150045+02'00') >> endobj xref 0 14 @@ -65,5 +65,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1204 +1206 %%EOF diff --git a/img/ResNet101_RecombNet all_v1.pdf b/img/ResNet101_RecombNet all_v1.pdf new file mode 100644 index 0000000..d2c4d1b Binary files /dev/null and b/img/ResNet101_RecombNet all_v1.pdf differ diff --git a/img/ResNet101_RecombNet all_v2.pdf b/img/ResNet101_RecombNet all_v2.pdf new file mode 100644 index 0000000..74734f4 Binary files /dev/null and b/img/ResNet101_RecombNet all_v2.pdf differ diff --git a/img/ResNet101_RecombNet all_v3.pdf b/img/ResNet101_RecombNet all_v3.pdf new file mode 100644 index 0000000..d97cbcc --- /dev/null +++ b/img/ResNet101_RecombNet all_v3.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn0 <O"şm[`m=@03mh +/Ę)0dI#m?|:xre OHxBX خg9~8OlmitJ&AµbV)ܑ'nI59E8-i"^شV jW ;@!i9:;SToI„j1/ 
-=S(v|߇> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150051+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000991 00000 n +0000000886 00000 n +0000000907 00000 n +0000000928 00000 n +0000000949 00000 n +0000000970 00000 n +0000000065 00000 n +0000000330 00000 n +0000000866 00000 n +0000000208 00000 n +0000000846 00000 n +0000001051 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1210 +%%EOF diff --git a/img/ResNet101_RecombNet_all_v1.pdf b/img/ResNet101_RecombNet_all_v1.pdf new file mode 100644 index 0000000..d2c4d1b Binary files /dev/null and b/img/ResNet101_RecombNet_all_v1.pdf differ diff --git a/img/ResNet101_RecombNet_all_v2.pdf b/img/ResNet101_RecombNet_all_v2.pdf new file mode 100644 index 0000000..74734f4 Binary files /dev/null and b/img/ResNet101_RecombNet_all_v2.pdf differ diff --git a/img/ResNet101_RecombNet_all_v3.pdf b/img/ResNet101_RecombNet_all_v3.pdf new file mode 100644 index 0000000..d97cbcc --- /dev/null +++ b/img/ResNet101_RecombNet_all_v3.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn0 <O"şm[`m=@03mh +/Ę)0dI#m?|:xre OHxBX خg9~8OlmitJ&AµbV)ܑ'nI59E8-i"^شV jW ;@!i9:;SToI„j1/ -=S(v|߇> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator 
(Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150051+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000991 00000 n +0000000886 00000 n +0000000907 00000 n +0000000928 00000 n +0000000949 00000 n +0000000970 00000 n +0000000065 00000 n +0000000330 00000 n +0000000866 00000 n +0000000208 00000 n +0000000846 00000 n +0000001051 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1210 +%%EOF diff --git a/img/ResNet50_ImageNet_v1.pdf b/img/ResNet50_ImageNet_v1.pdf index 953058c..32bc24f 100644 --- a/img/ResNet50_ImageNet_v1.pdf +++ b/img/ResNet50_ImageNet_v1.pdf @@ -45,9 +45,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094737+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150046+02'00') >> endobj xref 0 14 @@ -68,5 +68,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1210 +1212 %%EOF diff --git a/img/ResNet50_ImageNet_v2.pdf b/img/ResNet50_ImageNet_v2.pdf index ea19853..06ee1b4 100644 Binary files a/img/ResNet50_ImageNet_v2.pdf and b/img/ResNet50_ImageNet_v2.pdf differ diff --git a/img/ResNet50_ImageNet_v3.pdf b/img/ResNet50_ImageNet_v3.pdf index 3c2fd5c..e020acf 100644 Binary files a/img/ResNet50_ImageNet_v3.pdf and b/img/ResNet50_ImageNet_v3.pdf differ diff --git a/img/ResNet50_RecombNet all_v1.pdf b/img/ResNet50_RecombNet all_v1.pdf new file mode 100644 index 0000000..88ba2e9 --- /dev/null +++ b/img/ResNet50_RecombNet all_v1.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R 
/Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn@ <OCm.zvDvх0y!?|9z<|{OxB'$|G=2pVE羶h?l}z4 #4cDUcJprg1]itS-ͥ=#.Hp-k%viDZFRJ(rI4ٙ՝zbBž)}[@~Dg7+*^KzZ6E%<6r4w=vJ*,Qjq:S?ŭR-"ztEhyG .nMFVcs¢TK E9FTfu7Q:?YKDFm绲f*' OM!l\Gг _,+)U3>ߒvS //6g/98W ? +endstream +endobj +12 0 obj +437 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150050+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000987 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000966 00000 n +0000000065 00000 n +0000000330 00000 n +0000000862 00000 n +0000000208 00000 n +0000000842 00000 n +0000001047 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1206 +%%EOF diff --git a/img/ResNet50_RecombNet all_v2.pdf b/img/ResNet50_RecombNet all_v2.pdf new file mode 100644 index 0000000..d9c8523 Binary files /dev/null and b/img/ResNet50_RecombNet all_v2.pdf differ diff --git a/img/ResNet50_RecombNet all_v3.pdf b/img/ResNet50_RecombNet all_v3.pdf new file mode 100644 index 0000000..2259597 --- /dev/null +++ b/img/ResNet50_RecombNet all_v3.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn0 <OşmS`m=@03ih/7 C=_/x+0> 2 +>^@c|8?7Xa(RȋerFn|JGfi uN@\RW7L=ꤐHx҈j]%Bhvt0 
5vfWҢ*Fe+SYgCENTR&̦bHK6ǂ:Xhj%Rd.e%qQdRǣz՟RdL:/B!j;sx4N7zӠ45v3&!L33W\+͛J){+ohusIR=  ׏mќI%х~[ ;a;cI  %v^oiN {/ +endstream +endobj +12 0 obj +430 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150050+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000980 00000 n +0000000875 00000 n +0000000896 00000 n +0000000917 00000 n +0000000938 00000 n +0000000959 00000 n +0000000065 00000 n +0000000330 00000 n +0000000855 00000 n +0000000208 00000 n +0000000835 00000 n +0000001040 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1199 +%%EOF diff --git a/img/ResNet50_RecombNet_all_v1.pdf b/img/ResNet50_RecombNet_all_v1.pdf new file mode 100644 index 0000000..88ba2e9 --- /dev/null +++ b/img/ResNet50_RecombNet_all_v1.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn@ <OCm.zvDvх0y!?|9z<|{OxB'$|G=2pVE羶h?l}z4 #4cDUcJprg1]itS-ͥ=#.Hp-k%viDZFRJ(rI4ٙ՝zbBž)}[@~Dg7+*^KzZ6E%<6r4w=vJ*,Qjq:S?ŭR-"ztEhyG .nMFVcs¢TK E9FTfu7Q:?YKDFm绲f*' OM!l\Gг _,+)U3>ߒvS //6g/98W ? 
+endstream +endobj +12 0 obj +437 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150050+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000987 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000966 00000 n +0000000065 00000 n +0000000330 00000 n +0000000862 00000 n +0000000208 00000 n +0000000842 00000 n +0000001047 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1206 +%%EOF diff --git a/img/ResNet50_RecombNet_all_v2.pdf b/img/ResNet50_RecombNet_all_v2.pdf new file mode 100644 index 0000000..d9c8523 Binary files /dev/null and b/img/ResNet50_RecombNet_all_v2.pdf differ diff --git a/img/ResNet50_RecombNet_all_v3.pdf b/img/ResNet50_RecombNet_all_v3.pdf new file mode 100644 index 0000000..2259597 --- /dev/null +++ b/img/ResNet50_RecombNet_all_v3.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn0 <OşmS`m=@03ih/7 C=_/x+0> 2 +>^@c|8?7Xa(RȋerFn|JGfi uN@\RW7L=ꤐHx҈j]%Bhvt0 5vfWҢ*Fe+SYgCENTR&̦bHK6ǂ:Xhj%Rd.e%qQdRǣz՟RdL:/B!j;sx4N7zӠ45v3&!L33W\+͛J){+ohusIR=  ׏mќI%х~[ ;a;cI  %v^oiN {/ +endstream +endobj +12 0 obj +430 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> 
+endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150050+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000980 00000 n +0000000875 00000 n +0000000896 00000 n +0000000917 00000 n +0000000938 00000 n +0000000959 00000 n +0000000065 00000 n +0000000330 00000 n +0000000855 00000 n +0000000208 00000 n +0000000835 00000 n +0000001040 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1199 +%%EOF diff --git a/img/Swin-S_ImageNet_v1.pdf b/img/Swin-S_ImageNet_v1.pdf index 3dcd902..ef09926 100644 --- a/img/Swin-S_ImageNet_v1.pdf +++ b/img/Swin-S_ImageNet_v1.pdf @@ -43,9 +43,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094738+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150047+02'00') >> endobj xref 0 14 @@ -66,5 +66,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1205 +1207 %%EOF diff --git a/img/Swin-S_ImageNet_v2.pdf b/img/Swin-S_ImageNet_v2.pdf index fda1945..cde8932 100644 Binary files a/img/Swin-S_ImageNet_v2.pdf and b/img/Swin-S_ImageNet_v2.pdf differ diff --git a/img/Swin-S_ImageNet_v3.pdf b/img/Swin-S_ImageNet_v3.pdf index 66df876..0d223e5 100644 Binary files a/img/Swin-S_ImageNet_v3.pdf and b/img/Swin-S_ImageNet_v3.pdf differ diff --git a/img/Swin-S_RecombNet all_v1.pdf b/img/Swin-S_RecombNet all_v1.pdf new file mode 100644 index 0000000..c3cfae6 Binary files /dev/null and b/img/Swin-S_RecombNet all_v1.pdf differ diff --git a/img/Swin-S_RecombNet all_v2.pdf b/img/Swin-S_RecombNet all_v2.pdf new file mode 100644 index 0000000..2be743b --- /dev/null +++ b/img/Swin-S_RecombNet all_v2.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> 
+endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn@ <OC6Nmh4vȦ a'J7_~>w_ܜ^HȸC#Hֺ=~c͋{ILFhe$Gԛ)uToROh)RHY,ɐuΪ*a$#a#BD\jW z+J(;hbj^ÚGX(H1)M:6z5=vkT\Hyy<~Ԟ?qCrD$fCՕTm6ʁ]֝F?7Wqz3m/v46G\6.ʕPn@s47/,E +endstream +endobj +12 0 obj +401 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150049+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000951 00000 n +0000000846 00000 n +0000000867 00000 n +0000000888 00000 n +0000000909 00000 n +0000000930 00000 n +0000000065 00000 n +0000000330 00000 n +0000000826 00000 n +0000000208 00000 n +0000000806 00000 n +0000001011 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1170 +%%EOF diff --git a/img/Swin-S_RecombNet all_v3.pdf b/img/Swin-S_RecombNet all_v3.pdf new file mode 100644 index 0000000..49c3b06 Binary files /dev/null and b/img/Swin-S_RecombNet all_v3.pdf differ diff --git a/img/Swin-S_RecombNet_all_v1.pdf b/img/Swin-S_RecombNet_all_v1.pdf new file mode 100644 index 0000000..c3cfae6 Binary files /dev/null and b/img/Swin-S_RecombNet_all_v1.pdf differ diff --git a/img/Swin-S_RecombNet_all_v2.pdf b/img/Swin-S_RecombNet_all_v2.pdf new file mode 100644 index 0000000..2be743b --- /dev/null +++ b/img/Swin-S_RecombNet_all_v2.pdf @@ -0,0 +1,69 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R 
/ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xMn@ <OC6Nmh4vȦ a'J7_~>w_ܜ^HȸC#Hֺ=~c͋{ILFhe$Gԛ)uToROh)RHY,ɐuΪ*a$#a#BD\jW z+J(;hbj^ÚGX(H1)M:6z5=vkT\Hyy<~Ԟ?qCrD$fCՕTm6ʁ]֝F?7Wqz3m/v46G\6.ʕPn@s47/,E +endstream +endobj +12 0 obj +401 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150049+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000951 00000 n +0000000846 00000 n +0000000867 00000 n +0000000888 00000 n +0000000909 00000 n +0000000930 00000 n +0000000065 00000 n +0000000330 00000 n +0000000826 00000 n +0000000208 00000 n +0000000806 00000 n +0000001011 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1170 +%%EOF diff --git a/img/Swin-S_RecombNet_all_v3.pdf b/img/Swin-S_RecombNet_all_v3.pdf new file mode 100644 index 0000000..49c3b06 Binary files /dev/null and b/img/Swin-S_RecombNet_all_v3.pdf differ diff --git a/img/Swin-Ti_ImageNet_v1.pdf b/img/Swin-Ti_ImageNet_v1.pdf index 07b7fcd..171804c 100644 Binary files a/img/Swin-Ti_ImageNet_v1.pdf and b/img/Swin-Ti_ImageNet_v1.pdf differ diff --git a/img/Swin-Ti_ImageNet_v2.pdf b/img/Swin-Ti_ImageNet_v2.pdf index 315a962..7e56d1c 100644 --- a/img/Swin-Ti_ImageNet_v2.pdf +++ b/img/Swin-Ti_ImageNet_v2.pdf @@ -44,9 +44,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250301004049+02'00') >> +<< /Creator (Matplotlib v3.10.1, 
https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150045+02'00') >> endobj xref 0 14 @@ -67,5 +67,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1207 +1209 %%EOF diff --git a/img/Swin-Ti_ImageNet_v3.pdf b/img/Swin-Ti_ImageNet_v3.pdf index d738b01..f6c1ac1 100644 --- a/img/Swin-Ti_ImageNet_v3.pdf +++ b/img/Swin-Ti_ImageNet_v3.pdf @@ -43,9 +43,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250301004048+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150045+02'00') >> endobj xref 0 14 @@ -66,5 +66,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1204 +1206 %%EOF diff --git a/img/Swin-Ti_RecombNet all_v1.pdf b/img/Swin-Ti_RecombNet all_v1.pdf new file mode 100644 index 0000000..3c43d4b Binary files /dev/null and b/img/Swin-Ti_RecombNet all_v1.pdf differ diff --git a/img/Swin-Ti_RecombNet all_v2.pdf b/img/Swin-Ti_RecombNet all_v2.pdf new file mode 100644 index 0000000..757529b Binary files /dev/null and b/img/Swin-Ti_RecombNet all_v2.pdf differ diff --git a/img/Swin-Ti_RecombNet all_v3.pdf b/img/Swin-Ti_RecombNet all_v3.pdf new file mode 100644 index 0000000..6f7cc11 Binary files /dev/null and b/img/Swin-Ti_RecombNet all_v3.pdf differ diff --git a/img/Swin-Ti_RecombNet_all_v1.pdf b/img/Swin-Ti_RecombNet_all_v1.pdf new file mode 100644 index 0000000..3c43d4b Binary files /dev/null and b/img/Swin-Ti_RecombNet_all_v1.pdf differ diff --git a/img/Swin-Ti_RecombNet_all_v2.pdf b/img/Swin-Ti_RecombNet_all_v2.pdf new file mode 100644 index 0000000..757529b Binary files /dev/null and b/img/Swin-Ti_RecombNet_all_v2.pdf differ diff --git a/img/Swin-Ti_RecombNet_all_v3.pdf b/img/Swin-Ti_RecombNet_all_v3.pdf new file mode 100644 index 0000000..6f7cc11 Binary files 
/dev/null and b/img/Swin-Ti_RecombNet_all_v3.pdf differ diff --git a/img/ViT-B_ImageNet_v1.pdf b/img/ViT-B_ImageNet_v1.pdf index 7e7462a..8319743 100644 --- a/img/ViT-B_ImageNet_v1.pdf +++ b/img/ViT-B_ImageNet_v1.pdf @@ -43,9 +43,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094737+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150046+02'00') >> endobj xref 0 14 @@ -66,5 +66,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1207 +1209 %%EOF diff --git a/img/ViT-B_ImageNet_v2.pdf b/img/ViT-B_ImageNet_v2.pdf index 7836f68..d3cc4d6 100644 Binary files a/img/ViT-B_ImageNet_v2.pdf and b/img/ViT-B_ImageNet_v2.pdf differ diff --git a/img/ViT-B_ImageNet_v3.pdf b/img/ViT-B_ImageNet_v3.pdf index a159302..0844562 100644 --- a/img/ViT-B_ImageNet_v3.pdf +++ b/img/ViT-B_ImageNet_v3.pdf @@ -42,9 +42,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094736+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150045+02'00') >> endobj xref 0 14 @@ -65,5 +65,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1191 +1193 %%EOF diff --git a/img/ViT-B_RecombNet all_v1.pdf b/img/ViT-B_RecombNet all_v1.pdf new file mode 100644 index 0000000..2a70916 --- /dev/null +++ b/img/ViT-B_RecombNet all_v1.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R 
/MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xKADy<'[[@XpmM#1Hp}TE#V|i?}8|t|?O`|  od<" `U$|g{+~`7N#jHt3%OG:rppŷ rg῿1]ivS-ͣ]3!g!F S:#̙*gF4J(rR} WPgתb-JWv j㵊s2%l,:uu1MuJ2L%r\j1eAQF9]Vmϣ_m6"UX,l^BGޞL}[=d +h b.B (;BxNrkjk>ᵡ65(kXM=6Qr6*^(A `⓾rb{[ܫf_[D)݀Ww36 ^nn^Vsp +endstream +endobj +12 0 obj +437 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150049+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000987 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000966 00000 n +0000000065 00000 n +0000000330 00000 n +0000000862 00000 n +0000000208 00000 n +0000000842 00000 n +0000001047 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1206 +%%EOF diff --git a/img/ViT-B_RecombNet all_v2.pdf b/img/ViT-B_RecombNet all_v2.pdf new file mode 100644 index 0000000..93fc253 Binary files /dev/null and b/img/ViT-B_RecombNet all_v2.pdf differ diff --git a/img/ViT-B_RecombNet all_v3.pdf b/img/ViT-B_RecombNet all_v3.pdf new file mode 100644 index 0000000..e2bba06 Binary files /dev/null and b/img/ViT-B_RecombNet all_v3.pdf differ diff --git a/img/ViT-B_RecombNet_all_v1.pdf b/img/ViT-B_RecombNet_all_v1.pdf new file mode 100644 index 0000000..2a70916 --- /dev/null +++ b/img/ViT-B_RecombNet_all_v1.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 
R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xKADy<'[[@XpmM#1Hp}TE#V|i?}8|t|?O`|  od<" `U$|g{+~`7N#jHt3%OG:rppŷ rg῿1]ivS-ͣ]3!g!F S:#̙*gF4J(rR} WPgתb-JWv j㵊s2%l,:uu1MuJ2L%r\j1eAQF9]Vmϣ_m6"UX,l^BGޞL}[=d +h b.B (;BxNrkjk>ᵡ65(kXM=6Qr6*^(A `⓾rb{[ܫf_[D)݀Ww36 ^nn^Vsp +endstream +endobj +12 0 obj +437 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150049+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000987 00000 n +0000000882 00000 n +0000000903 00000 n +0000000924 00000 n +0000000945 00000 n +0000000966 00000 n +0000000065 00000 n +0000000330 00000 n +0000000862 00000 n +0000000208 00000 n +0000000842 00000 n +0000001047 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1206 +%%EOF diff --git a/img/ViT-B_RecombNet_all_v2.pdf b/img/ViT-B_RecombNet_all_v2.pdf new file mode 100644 index 0000000..93fc253 Binary files /dev/null and b/img/ViT-B_RecombNet_all_v2.pdf differ diff --git a/img/ViT-B_RecombNet_all_v3.pdf b/img/ViT-B_RecombNet_all_v3.pdf new file mode 100644 index 0000000..e2bba06 Binary files /dev/null and b/img/ViT-B_RecombNet_all_v3.pdf differ diff --git a/img/ViT-L_ImageNet_v1.pdf b/img/ViT-L_ImageNet_v1.pdf index f2769b8..5c4d8b8 100644 Binary files a/img/ViT-L_ImageNet_v1.pdf and b/img/ViT-L_ImageNet_v1.pdf differ diff --git a/img/ViT-L_ImageNet_v2.pdf b/img/ViT-L_ImageNet_v2.pdf index 25fda26..b4b082e 100644 Binary files a/img/ViT-L_ImageNet_v2.pdf and b/img/ViT-L_ImageNet_v2.pdf differ diff --git a/img/ViT-L_ImageNet_v3.pdf b/img/ViT-L_ImageNet_v3.pdf index 07e9d9a..bf94eea 100644 --- a/img/ViT-L_ImageNet_v3.pdf +++ b/img/ViT-L_ImageNet_v3.pdf @@ -42,9 +42,9 @@ endobj 
<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094738+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150047+02'00') >> endobj xref 0 14 @@ -65,5 +65,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1182 +1184 %%EOF diff --git a/img/ViT-L_RecombNet all_v1.pdf b/img/ViT-L_RecombNet all_v1.pdf new file mode 100644 index 0000000..8f46491 Binary files /dev/null and b/img/ViT-L_RecombNet all_v1.pdf differ diff --git a/img/ViT-L_RecombNet all_v2.pdf b/img/ViT-L_RecombNet all_v2.pdf new file mode 100644 index 0000000..a06ee6d --- /dev/null +++ b/img/ViT-L_RecombNet all_v2.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xAn0 E<O")۶.z`&mh +/DK3c ,}O~t?~o`|'$|F¿x@' $\U%k^Fp_es'QMEQţ4}l5ݹrNpQ='8sȢ;)$"q"W ;\!jBn`J.pߞAa(bRԛ!-]"điմswSn]cFN(5Wai4wyl^g'nⷐV)9V̛ )#m%^z Fy[<Fo)aڹ^}vo7[KB^&5;r`cž\8H&Uv^/mh^98g} +endstream +endobj +12 0 obj +421 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150048+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000971 00000 n +0000000866 00000 n +0000000887 00000 n +0000000908 00000 n +0000000929 00000 n 
+0000000950 00000 n +0000000065 00000 n +0000000330 00000 n +0000000846 00000 n +0000000208 00000 n +0000000826 00000 n +0000001031 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1190 +%%EOF diff --git a/img/ViT-L_RecombNet all_v3.pdf b/img/ViT-L_RecombNet all_v3.pdf new file mode 100644 index 0000000..c7a96d9 Binary files /dev/null and b/img/ViT-L_RecombNet all_v3.pdf differ diff --git a/img/ViT-L_RecombNet_all_v1.pdf b/img/ViT-L_RecombNet_all_v1.pdf new file mode 100644 index 0000000..8f46491 Binary files /dev/null and b/img/ViT-L_RecombNet_all_v1.pdf differ diff --git a/img/ViT-L_RecombNet_all_v2.pdf b/img/ViT-L_RecombNet_all_v2.pdf new file mode 100644 index 0000000..a06ee6d --- /dev/null +++ b/img/ViT-L_RecombNet_all_v2.pdf @@ -0,0 +1,70 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xAn0 E<O")۶.z`&mh +/DK3c ,}O~t?~o`|'$|F¿x@' $\U%k^Fp_es'QMEQţ4}l5ݹrNpQ='8sȢ;)$"q"W ;\!jBn`J.pߞAa(bRԛ!-]"điմswSn]cFN(5Wai4wyl^g'nⷐV)9V̛ )#m%^z Fy[<Fo)aڹ^}vo7[KB^&5;r`cž\8H&Uv^/mh^98g} +endstream +endobj +12 0 obj +421 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150048+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000971 00000 n +0000000866 00000 n +0000000887 00000 n +0000000908 00000 n +0000000929 00000 n +0000000950 00000 n +0000000065 00000 n +0000000330 00000 n +0000000846 
00000 n +0000000208 00000 n +0000000826 00000 n +0000001031 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1190 +%%EOF diff --git a/img/ViT-L_RecombNet_all_v3.pdf b/img/ViT-L_RecombNet_all_v3.pdf new file mode 100644 index 0000000..c7a96d9 Binary files /dev/null and b/img/ViT-L_RecombNet_all_v3.pdf differ diff --git a/img/ViT-S_ImageNet_v1.pdf b/img/ViT-S_ImageNet_v1.pdf index 9daf666..af1acb7 100644 Binary files a/img/ViT-S_ImageNet_v1.pdf and b/img/ViT-S_ImageNet_v1.pdf differ diff --git a/img/ViT-S_ImageNet_v2.pdf b/img/ViT-S_ImageNet_v2.pdf index 364b437..25f10a4 100644 Binary files a/img/ViT-S_ImageNet_v2.pdf and b/img/ViT-S_ImageNet_v2.pdf differ diff --git a/img/ViT-S_ImageNet_v3.pdf b/img/ViT-S_ImageNet_v3.pdf index 30ae7a5..6773bd9 100644 --- a/img/ViT-S_ImageNet_v3.pdf +++ b/img/ViT-S_ImageNet_v3.pdf @@ -43,9 +43,9 @@ endobj << /Type /Pages /Kids [ 11 0 R ] /Count 1 >> endobj 13 0 obj -<< /Creator (Matplotlib v3.9.4, https://matplotlib.org) -/Producer (Matplotlib pdf backend v3.9.4) -/CreationDate (D:20250227094737+02'00') >> +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150046+02'00') >> endobj xref 0 14 @@ -66,5 +66,5 @@ xref trailer << /Size 14 /Root 1 0 R /Info 13 0 R >> startxref -1210 +1212 %%EOF diff --git a/img/ViT-S_RecombNet all_v1.pdf b/img/ViT-S_RecombNet all_v1.pdf new file mode 100644 index 0000000..9d82eb3 Binary files /dev/null and b/img/ViT-S_RecombNet all_v1.pdf differ diff --git a/img/ViT-S_RecombNet all_v2.pdf b/img/ViT-S_RecombNet all_v2.pdf new file mode 100644 index 0000000..56a7427 Binary files /dev/null and b/img/ViT-S_RecombNet all_v2.pdf differ diff --git a/img/ViT-S_RecombNet all_v3.pdf b/img/ViT-S_RecombNet all_v3.pdf new file mode 100644 index 0000000..1581fd3 --- /dev/null +++ b/img/ViT-S_RecombNet all_v3.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R 
/XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xnA E +.Ҭt$ꕙ&Eq>~?}9~_ru  2 7`U$|k; ?×~gC:9IZr"t̊/ȝy~c4P[K VNUDrn̊va.ŵxW ;@n!iLCV$[+UR\Om%v\LSnd颭fI)RUPΐwStVzsw1CPZXqY~_M +cmYœHVo a'l;ec6XZd%%\r]7caz%)ڷPRhZOԂIWIrbf267xp>*}hjP߷Nv yu>_7 +endstream +endobj +12 0 obj +441 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150051+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000991 00000 n +0000000886 00000 n +0000000907 00000 n +0000000928 00000 n +0000000949 00000 n +0000000970 00000 n +0000000065 00000 n +0000000330 00000 n +0000000866 00000 n +0000000208 00000 n +0000000846 00000 n +0000001051 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1210 +%%EOF diff --git a/img/ViT-S_RecombNet_all_v1.pdf b/img/ViT-S_RecombNet_all_v1.pdf new file mode 100644 index 0000000..9d82eb3 Binary files /dev/null and b/img/ViT-S_RecombNet_all_v1.pdf differ diff --git a/img/ViT-S_RecombNet_all_v2.pdf b/img/ViT-S_RecombNet_all_v2.pdf new file mode 100644 index 0000000..56a7427 Binary files /dev/null and b/img/ViT-S_RecombNet_all_v2.pdf differ diff --git a/img/ViT-S_RecombNet_all_v3.pdf b/img/ViT-S_RecombNet_all_v3.pdf new file mode 100644 index 0000000..1581fd3 --- /dev/null +++ b/img/ViT-S_RecombNet_all_v3.pdf @@ -0,0 +1,71 @@ +%PDF-1.4 +% +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +8 0 obj +<< /Font 3 0 R /XObject 7 0 R /ExtGState 4 0 R /Pattern 5 0 R +/Shading 6 0 R /ProcSet 
[ /PDF /Text /ImageB /ImageC /ImageI ] >> +endobj +11 0 obj +<< /Type /Page /Parent 2 0 R /Resources 8 0 R /MediaBox [ 0 0 144 144 ] +/Contents 9 0 R /Annots 10 0 R >> +endobj +9 0 obj +<< /Length 12 0 R /Filter /FlateDecode >> +stream +xnA E +.Ҭt$ꕙ&Eq>~?}9~_ru  2 7`U$|k; ?×~gC:9IZr"t̊/ȝy~c4P[K VNUDrn̊va.ŵxW ;@n!iLCV$[+UR\Om%v\LSnd颭fI)RUPΐwStVzsw1CPZXqY~_M +cmYœHVo a'l;ec6XZd%%\r]7caz%)ڷPRhZOԂIWIrbf267xp>*}hjP߷Nv yu>_7 +endstream +endobj +12 0 obj +441 +endobj +10 0 obj +[ ] +endobj +3 0 obj +<< >> +endobj +4 0 obj +<< >> +endobj +5 0 obj +<< >> +endobj +6 0 obj +<< >> +endobj +7 0 obj +<< >> +endobj +2 0 obj +<< /Type /Pages /Kids [ 11 0 R ] /Count 1 >> +endobj +13 0 obj +<< /Creator (Matplotlib v3.10.1, https://matplotlib.org) +/Producer (Matplotlib pdf backend v3.10.1) +/CreationDate (D:20250724150051+02'00') >> +endobj +xref +0 14 +0000000000 65535 f +0000000016 00000 n +0000000991 00000 n +0000000886 00000 n +0000000907 00000 n +0000000928 00000 n +0000000949 00000 n +0000000970 00000 n +0000000065 00000 n +0000000330 00000 n +0000000866 00000 n +0000000208 00000 n +0000000846 00000 n +0000001051 00000 n +trailer +<< /Size 14 /Root 1 0 R /Info 13 0 R >> +startxref +1210 +%%EOF diff --git a/img/background no icon.png b/img/background no icon.png new file mode 100644 index 0000000..ab24e9b Binary files /dev/null and b/img/background no icon.png differ diff --git a/img/bg_robustness.pdf b/img/bg_robustness.pdf new file mode 100644 index 0000000..dcfd724 Binary files /dev/null and b/img/bg_robustness.pdf differ diff --git a/img/bg_robustness_4.pdf b/img/bg_robustness_4.pdf new file mode 100644 index 0000000..7b7d3be Binary files /dev/null and b/img/bg_robustness_4.pdf differ diff --git a/img/color change icon.png b/img/color change icon.png new file mode 100644 index 0000000..98e7b73 Binary files /dev/null and b/img/color change icon.png differ diff --git a/img/colorbar_horizontal.pdf b/img/colorbar_horizontal.pdf index 141aa5f..8779c08 100644 Binary files 
a/img/colorbar_horizontal.pdf and b/img/colorbar_horizontal.pdf differ diff --git a/img/colorbar_vertical.pdf b/img/colorbar_vertical.pdf index 4c42a00..754158e 100644 Binary files a/img/colorbar_vertical.pdf and b/img/colorbar_vertical.pdf differ diff --git a/img/edge blur icon.png b/img/edge blur icon.png new file mode 100644 index 0000000..e0b7c7f Binary files /dev/null and b/img/edge blur icon.png differ diff --git a/img/extraction icon.png b/img/extraction icon.png new file mode 100644 index 0000000..3084923 Binary files /dev/null and b/img/extraction icon.png differ diff --git a/img/fg_focus.pdf b/img/fg_focus.pdf new file mode 100644 index 0000000..77b9b97 Binary files /dev/null and b/img/fg_focus.pdf differ diff --git a/img/fig-1.drawio b/img/fig-1.drawio index 00f822f..0b7f8c2 100644 --- a/img/fig-1.drawio +++ b/img/fig-1.drawio @@ -1,6 +1,6 @@ - + - + @@ -20,9 +20,9 @@ - + - + @@ -121,7 +121,7 @@ - + diff --git a/img/fig-1.drawio.png b/img/fig-1.drawio.png new file mode 100644 index 0000000..7a0b90b Binary files /dev/null and b/img/fig-1.drawio.png differ diff --git a/img/fig-1.pdf b/img/fig-1.pdf index 33f9c75..08adcb3 100644 Binary files a/img/fig-1.pdf and b/img/fig-1.pdf differ diff --git a/img/fig-1.png b/img/fig-1.png new file mode 100644 index 0000000..7863c38 Binary files /dev/null and b/img/fig-1.png differ diff --git a/img/fig-2-horizontal.drawio b/img/fig-2-horizontal.drawio new file mode 100644 index 0000000..e21f1ff --- /dev/null +++ b/img/fig-2-horizontal.drawio @@ -0,0 +1,406 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/fig-2.jpg b/img/fig-2-old.jpg similarity index 100% rename from img/fig-2.jpg rename to img/fig-2-old.jpg diff --git a/img/fig-2-old.pdf b/img/fig-2-old.pdf new file mode 100644 index 0000000..c8abc84 Binary files /dev/null and b/img/fig-2-old.pdf differ diff --git a/img/fig-2.drawio b/img/fig-2.drawio index 522fbbf..34ea040 100644 --- a/img/fig-2.drawio +++ b/img/fig-2.drawio @@ -1,240 +1,284 @@ - + - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - + + + + - - + + + + + - - + + - - + + - - + + - - + + - - - - - - - - + + - - + + - - + + - - + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - + + - - + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + diff --git a/img/fig-2.pdf b/img/fig-2.pdf index c8abc84..14d154e 100644 Binary files a/img/fig-2.pdf and b/img/fig-2.pdf differ diff --git a/img/fig-2.png b/img/fig-2.png new file mode 100644 index 0000000..7dcbd15 Binary files /dev/null and b/img/fig-2.png differ diff --git a/img/fig-2_old.pdf b/img/fig-2_old.pdf new file mode 100644 index 
0000000..6b03135 Binary files /dev/null and b/img/fig-2_old.pdf differ diff --git a/img/filter icon.png b/img/filter icon.png new file mode 100644 index 0000000..d7a71f8 Binary files /dev/null and b/img/filter icon.png differ diff --git a/img/foraug-gif.gif b/img/foraug-gif.gif new file mode 100644 index 0000000..a6ac2de Binary files /dev/null and b/img/foraug-gif.gif differ diff --git a/img/foraug-gif.webp b/img/foraug-gif.webp new file mode 100644 index 0000000..4bcf165 Binary files /dev/null and b/img/foraug-gif.webp differ diff --git a/img/foreground yes icon.png b/img/foreground yes icon.png new file mode 100644 index 0000000..a38f215 Binary files /dev/null and b/img/foreground yes icon.png differ diff --git a/img/fornet-gif.drawio b/img/fornet-gif.drawio new file mode 100644 index 0000000..4d79507 --- /dev/null +++ b/img/fornet-gif.drawio @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/gif-bg.png b/img/gif-bg.png new file mode 100644 index 0000000..e363c63 Binary files /dev/null and b/img/gif-bg.png differ diff --git a/img/gif-fg.png b/img/gif-fg.png new file mode 100644 index 0000000..d4d5ddc Binary files /dev/null and b/img/gif-fg.png differ diff --git a/img/gif-final-0.png b/img/gif-final-0.png new file mode 100644 index 0000000..daf8710 Binary files /dev/null and b/img/gif-final-0.png differ diff --git a/img/gif-final-1.png b/img/gif-final-1.png new file mode 100644 index 0000000..0612ab1 Binary files /dev/null and b/img/gif-final-1.png differ diff --git a/img/gif-final-2.png b/img/gif-final-2.png new file mode 100644 index 0000000..0fffb6e Binary files /dev/null and b/img/gif-final-2.png differ diff --git a/img/gif-final-3.png b/img/gif-final-3.png new file mode 100644 index 0000000..b1c98d0 Binary files /dev/null and b/img/gif-final-3.png differ diff --git a/img/gif-final-4.png b/img/gif-final-4.png new file mode 100644 index 0000000..5a1b48d Binary files /dev/null and 
b/img/gif-final-4.png differ diff --git a/img/gif-final-5.png b/img/gif-final-5.png new file mode 100644 index 0000000..cd68936 Binary files /dev/null and b/img/gif-final-5.png differ diff --git a/img/gif-final-6.png b/img/gif-final-6.png new file mode 100644 index 0000000..fe699e3 Binary files /dev/null and b/img/gif-final-6.png differ diff --git a/img/gif-final-7.png b/img/gif-final-7.png new file mode 100644 index 0000000..68fb544 Binary files /dev/null and b/img/gif-final-7.png differ diff --git a/img/gif-final-8.png b/img/gif-final-8.png new file mode 100644 index 0000000..765dcbc Binary files /dev/null and b/img/gif-final-8.png differ diff --git a/img/gif-final-9.png b/img/gif-final-9.png new file mode 100644 index 0000000..07c03f1 Binary files /dev/null and b/img/gif-final-9.png differ diff --git a/img/gif-inp.png b/img/gif-inp.png new file mode 100644 index 0000000..e8d211e Binary files /dev/null and b/img/gif-inp.png differ diff --git a/img/horizontal flip icon new.png b/img/horizontal flip icon new.png new file mode 100644 index 0000000..ba2751b Binary files /dev/null and b/img/horizontal flip icon new.png differ diff --git a/img/horse_mask_1.WEBP b/img/horse_mask_1.WEBP new file mode 100644 index 0000000..ad7c8f2 Binary files /dev/null and b/img/horse_mask_1.WEBP differ diff --git a/img/horse_mask_2.WEBP b/img/horse_mask_2.WEBP new file mode 100644 index 0000000..f7d5c81 Binary files /dev/null and b/img/horse_mask_2.WEBP differ diff --git a/img/infill icon.png b/img/infill icon.png new file mode 100644 index 0000000..6529ae1 Binary files /dev/null and b/img/infill icon.png differ diff --git a/img/infill_distr.pdf b/img/infill_distr.pdf new file mode 100644 index 0000000..c946d8c Binary files /dev/null and b/img/infill_distr.pdf differ diff --git a/img/object size icon.png b/img/object size icon.png new file mode 100644 index 0000000..b21f09c Binary files /dev/null and b/img/object size icon.png differ diff --git a/img/random crop icon.png b/img/random 
crop icon.png new file mode 100644 index 0000000..b779465 Binary files /dev/null and b/img/random crop icon.png differ diff --git a/img/random draw icon.png b/img/random draw icon.png new file mode 100644 index 0000000..9dca91e Binary files /dev/null and b/img/random draw icon.png differ diff --git a/img/random flipping icon.png b/img/random flipping icon.png new file mode 100644 index 0000000..cf44d5b Binary files /dev/null and b/img/random flipping icon.png differ diff --git a/img/random position icon.png b/img/random position icon.png new file mode 100644 index 0000000..793e2c0 Binary files /dev/null and b/img/random position icon.png differ diff --git a/img/random resize icon.png b/img/random resize icon.png new file mode 100644 index 0000000..d7f535f Binary files /dev/null and b/img/random resize icon.png differ diff --git a/img/rotation icon.png b/img/rotation icon.png new file mode 100644 index 0000000..96fe089 Binary files /dev/null and b/img/rotation icon.png differ diff --git a/img/segmentation icon.png b/img/segmentation icon.png new file mode 100644 index 0000000..8e8232d Binary files /dev/null and b/img/segmentation icon.png differ diff --git a/img/size_bias.pdf b/img/size_bias.pdf index 0515183..9706f2e 100644 Binary files a/img/size_bias.pdf and b/img/size_bias.pdf differ diff --git a/img/size_bias_grid.pdf b/img/size_bias_grid.pdf new file mode 100644 index 0000000..c6f3296 Binary files /dev/null and b/img/size_bias_grid.pdf differ diff --git a/main.bib b/main.bib new file mode 100644 index 0000000..a4682b7 --- /dev/null +++ b/main.bib @@ -0,0 +1,9033 @@ +@Misc{Liang2022, + author = {Liang, Paul Pu and Zadeh, Amir and Morency, Louis-Philippe}, + title = {Foundations and Recent Trends in Multimodal Machine Learning: Principles, Challenges, and Open Questions}, + doi = {10.48550/ARXIV.2209.03430}, + url = {joplin://x-callback-url/openNote?id=fe93b7e173f4478da42f09cbfdd379ea}, + file = {:Liang2022 - Foundations and Recent Trends in Multimodal Machine 
Learning_ Principles, Challenges, and Open Questions.pdf:PDF}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2022}, +} + +@Article{Weston2014, + author = {Weston, Jason and Chopra, Sumit and Bordes, Antoine}, + title = {Memory Networks}, + doi = {10.48550/ARXIV.1410.3916}, + file = {:Weston2014 - Memory Networks.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2014}, +} + +@Misc{Graves2014, + author = {Graves, Alex and Wayne, Greg and Danihelka, Ivo}, + title = {Neural Turing Machines}, + doi = {10.48550/ARXIV.1410.5401}, + file = {:Graves2014 - Neural Turing Machines.pdf:PDF}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2014}, +} + +@Article{LeCun2022, + author = {LeCun, Yann}, + title = {A path towards autonomous machine intelligence}, + file = {:LeCun2022 - A Path Towards Autonomous Machine Intelligence.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2022}, +} + +@Article{Bianchi2020, + author = {{Bianchi}, Federico and {Rossiello}, Gaetano and {Costabello}, Luca and {Palmonari}, Matteo and {Minervini}, Pasquale}, + title = {{Knowledge Graph Embeddings and Explainable AI}}, + doi = {10.48550/arxiv.2004.14843}, + eprint = {2004.14843}, + archiveprefix = {arXiv}, + file = {:Bianchi2020 - Knowledge Graph Embeddings and Explainable AI.pdf:PDF}, + primaryclass = {cs.AI}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@Article{Hitzler2020, + author = {Hitzler, Pascal and Janowicz, Krzysztof and Lecue, Freddy}, + title = {On the Role of Knowledge Graphs in Explainable AI}, + doi = {10.3233/SW-190374}, + number = {1}, + pages = {41–51}, + volume = {11}, + address = {NLD}, + file = {:Hitzler2020 - On the Role of Knowledge Graphs in Explainable AI.pdf:PDF}, + issue_date = {2020}, + journal = {Semant. 
Web}, + month = {jan}, + publisher = {IOS Press}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2020}, +} + +@InProceedings{Jaderberg2015, + author = {Jaderberg, Max and Simonyan, Karen and Zisserman, Andrew and Kavukcuoglu, Koray}, + booktitle = {Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 2}, + title = {Spatial Transformer Networks}, + location = {Montreal, Canada}, + pages = {2017–2025}, + publisher = {MIT Press}, + series = {NIPS'15}, + address = {Cambridge, MA, USA}, + file = {:Jaderberg2015 - Spatial Transformer Networks.pdf:PDF}, + numpages = {9}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2015}, +} + +@Misc{Chen2022, + author = {Chen, Zhenghua and Wu, Min and Chan, Alvin and Li, Xiaoli and Ong, Yew-Soon}, + title = {A Survey on AI Sustainability: Emerging Trends on Learning Algorithms and Research Challenges}, + doi = {10.48550/ARXIV.2205.03824}, + file = {:Chen2022 - A Survey on AI Sustainability_ Emerging Trends on Learning Algorithms and Research Challenges.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Wynsberghe2021, + author = {van Wynsberghe, Aimee}, + title = {Sustainable AI: AI for sustainability and the sustainability of AI}, + doi = {10.1007/s43681-021-00043-6}, + number = {3}, + pages = {213-218}, + volume = {1}, + file = {:Wynsberghe2021 - Sustainable AI_ AI for Sustainability and the Sustainability of AI.pdf:PDF}, + journal = {AI and Ethics}, + month = {Aug}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2021}, +} + +@InProceedings{Marino2019, + author = {Kenneth Marino and Mohammad Rastegari and Ali Farhadi and Roozbeh Mottaghi}, + booktitle = {Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge}, + file = {:Marino2019 - OK VQA_ a Visual Question Answering 
Benchmark Requiring External Knowledge.pdf:PDF}, + groups = {Datasets}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2019}, +} + +@Article{Schwenk2022, + author = {Dustin Schwenk and Apoorv Khandelwal and Christopher Clark and Kenneth Marino and Roozbeh Mottaghi}, + title = {A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge}, + file = {:Schwenk2022 - A OKVQA_ a Benchmark for Visual Question Answering Using World Knowledge.pdf:PDF}, + groups = {Datasets}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2022}, +} + +@InProceedings{Gui2022, + author = {Gui, Liangke and Wang, Borui and Huang, Qiuyuan and Hauptmann, Alexander and Bisk, Yonatan and Gao, Jianfeng}, + booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + title = {{KAT}: A Knowledge Augmented Transformer for Vision-and-Language}, + doi = {10.18653/v1/2022.naacl-main.70}, + pages = {956--968}, + publisher = {Association for Computational Linguistics}, + address = {Seattle, United States}, + file = {:Gui2022 - KAT_ a Knowledge Augmented Transformer for Vision and Language.pdf:PDF}, + month = jul, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Vaswani2017, + author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, Lukasz and Polosukhin, Illia}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Attention is All you Need}, + editor = {I. Guyon and U. Von Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. 
Garnett}, + publisher = {Curran Associates, Inc.}, + volume = {30}, + file = {:Vaswani2017 - Attention Is All You Need.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2017}, +} + +@InProceedings{Antol2015, + author = {Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C. Lawrence Zitnick and Devi Parikh}, + booktitle = {International Conference on Computer Vision (ICCV)}, + title = {{VQA}: {V}isual {Q}uestion {A}nswering}, + file = {:Antol2015 - VQA_ Visual Question Answering.pdf:PDF}, + groups = {Datasets}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2015}, +} + +@Article{Raffel2020, + author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu}, + title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}, + number = {140}, + pages = {1--67}, + volume = {21}, + file = {:Raffel2020 - Exploring the Limits of Transfer Learning with a Unified Text to Text Transformer.pdf:PDF}, + journal = {Journal of Machine Learning Research}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@InProceedings{Izacard2021, + author = {Izacard, Gautier and Grave, Edouard}, + booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume}, + title = {Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering}, + doi = {10.18653/v1/2021.eacl-main.74}, + pages = {874--880}, + publisher = {Association for Computational Linguistics}, + address = {Online}, + file = {:Izacard2021 - Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering.pdf:PDF}, + month = apr, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2021}, +} + +@Misc{Izacard2020, + author = {Gautier Izacard and Edouard 
Grave}, + title = {Distilling Knowledge from Reader to Retriever for Question Answering}, + doi = {10.48550/arxiv.2012.04584}, + eprint = {2012.04584}, + archiveprefix = {arXiv}, + file = {:Izacard2020 - Distilling Knowledge from Reader to Retriever for Question Answering.pdf:PDF}, + primaryclass = {cs.CL}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@InProceedings{Radford2021, + author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya}, + booktitle = {Proceedings of the 38th International Conference on Machine Learning}, + title = {Learning Transferable Visual Models From Natural Language Supervision}, + editor = {Meila, Marina and Zhang, Tong}, + pages = {8748--8763}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + volume = {139}, + file = {:Radford2021 - Learning Transferable Visual Models from Natural Language Supervision.pdf:PDF}, + month = {18--24 Jul}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Yang2022, + author = {Yang, Zhengyuan and Gan, Zhe and Wang, Jianfeng and Hu, Xiaowei and Lu, Yumao and Liu, Zicheng and Wang, Lijuan}, + booktitle = {AAAI}, + title = {An Empirical Study of GPT-3 for Few-Shot Knowledge-Based VQA}, + file = {:Yang2022 - An Empirical Study of GPT 3 for Few Shot Knowledge Based VQA.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Misc{Li2022, + author = {Li, Chenliang and Xu, Haiyang and Tian, Junfeng and Wang, Wei and Yan, Ming and Bi, Bin and Ye, Jiabo and Chen, Hehong and Xu, Guohai and Cao, Zheng and Zhang, Ji and Huang, Songfang and Huang, Fei and Zhou, Jingren and Si, Luo}, + title = {mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections}, + doi = 
{10.48550/ARXIV.2205.12005}, + file = {:Li2022 - MPLUG_ Effective and Efficient Vision Language Learning by Cross Modal Skip Connections.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{LeeThorp2022, + author = {James P Lee-Thorp and Joshua Ainslie and Ilya Eckstein and Santiago Onta{\~n}{\'o}n}, + booktitle = {NAACL}, + title = {FNet: Mixing Tokens with Fourier Transforms}, + file = {:LeeThorp2022 - FNet_ Mixing Tokens with Fourier Transforms.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Misc{Touvron2021, + author = {Touvron, Hugo and Cord, Matthieu and El-Nouby, Alaaeldin and Bojanowski, Piotr and Joulin, Armand and Synnaeve, Gabriel and Jégou, Hervé}, + title = {Augmenting Convolutional networks with attention-based aggregation}, + doi = {10.48550/ARXIV.2112.13692}, + file = {:Touvron2021 - Augmenting Convolutional Networks with Attention Based Aggregation.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Touvron2022, + author = {Touvron, Hugo and Cord, Matthieu and J{\'e}gou, Herv{\'e}}, + booktitle = {Computer Vision -- ECCV 2022}, + title = {DeiT III: Revenge of the ViT}, + editor = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal}, + pages = {516--533}, + publisher = {Springer Nature Switzerland}, + address = {Cham}, + file = {:Touvron2022 - DeiT III_ Revenge of the ViT.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Dosovitskiy2021, + author = {Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby}, + booktitle = {9th International Conference on Learning Representations, {ICLR} 2021, Virtual Event, 
Austria, May 3-7, 2021}, + title = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, + publisher = {OpenReview.net}, + file = {:Dosovitskiy2021 - An Image Is Worth 16x16 Words_ Transformers for Image Recognition at Scale.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Tolstikhin2021, + author = {Ilya Tolstikhin and Neil Houlsby and Alexander Kolesnikov and Lucas Beyer and Xiaohua Zhai and Thomas Unterthiner and Jessica Yung and Andreas Peter Steiner and Daniel Keysers and Jakob Uszkoreit and Mario Lucic and Alexey Dosovitskiy}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {{MLP}-Mixer: An all-{MLP} Architecture for Vision}, + editor = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan}, + eprint = {2105.01601}, + pages = {24261--24272}, + url = {https://openreview.net/forum?id=EI2KOXKdnP}, + archiveprefix = {arXiv}, + file = {:Tolstikhin2021 - MLP Mixer_ an All MLP Architecture for Vision.pdf:PDF}, + primaryclass = {cs.CV}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Huang2020, + author = {Huang, Xiao Shi and Perez, Felipe and Ba, Jimmy and Volkovs, Maksims}, + booktitle = {Proceedings of the 37th International Conference on Machine Learning}, + title = {Improving Transformer Optimization Through Better Initialization}, + editor = {III, Hal Daumé and Singh, Aarti}, + pages = {4475--4483}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + volume = {119}, + file = {:Huang2020 - Improving Transformer Optimization through Better Initialization.pdf:PDF}, + month = {13--18 Jul}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2020}, +} + +@InProceedings{Liang2022a, + author = {Youwei Liang and Chongjian Ge and Zhan Tong and Yibing Song and Jue Wang and Pengtao Xie}, + booktitle = {International Conference on Learning 
Representations}, + title = {Not All Patches are What You Need: Expediting Vision Transformers via Token Reorganizations}, + comment = {EViT}, + file = {:Liang2022a - Not All Patches Are What You Need_ Expediting Vision Transformers Via Token Reorganizations.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Bartoldson2023, + author = {Bartoldson, Brian R and Kailkhura, Bhavya and Blalock, Davis}, + title = {Compute-efficient deep learning: Algorithmic trends and opportunities}, + number = {122}, + pages = {1--77}, + volume = {24}, + file = {:Bartoldson2022 - Compute Efficient Deep Learning_ Algorithmic Trends and Opportunities.pdf:PDF}, + journal = {Journal of Machine Learning Research}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Tay2022, + author = {Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler, Donald}, + title = {Efficient Transformers: A Survey}, + doi = {10.1145/3530811}, + address = {New York, NY, USA}, + file = {:Tay2022 - Efficient Transformers_ a Survey.pdf:PDF}, + journal = {ACM Comput. Surv.}, + month = {4}, + publisher = {Association for Computing Machinery}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Xu2022, + author = {Xu, Jiarui and De Mello, Shalini and Liu, Sifei and Byeon, Wonmin and Breuel, Thomas and Kautz, Jan and Wang, Xiaolong}, + title = {GroupViT: Semantic Segmentation Emerges from Text Supervision}, + doi = {10.48550/arXiv.2202.11094}, + file = {:Xu2022 - GroupViT_ Semantic Segmentation Emerges from Text Supervision.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Misc{Selva2022, + author = {Selva, Javier and Johansen, Anders S. and Escalera, Sergio and Nasrollahi, Kamal and Moeslund, Thomas B. 
and Clapés, Albert}, + title = {Video Transformers: A Survey}, + doi = {10.48550/ARXIV.2201.05991}, + file = {:Selva2022 - Video Transformers_ a Survey.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2022}, +} + +@InProceedings{Dehghani2022, + author = {Mostafa Dehghani and Yi Tay and Anurag Arnab and Lucas Beyer and Ashish Vaswani}, + booktitle = {International Conference on Learning Representations}, + title = {The Efficiency Misnomer}, + file = {:Dehghani2022 - The Efficiency Misnomer.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Zhao2021, + author = {Bo Zhao and Konda Reddy Mopuri and Hakan Bilen}, + booktitle = {International Conference on Learning Representations}, + title = {Dataset Condensation with Gradient Matching}, + file = {:Zhao2021 - Dataset Condensation with Gradient Matching.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Huang2016, + author = {Huang, Gao and Sun, Yu and Liu, Zhuang and Sedra, Daniel and Weinberger, Kilian Q.}, + booktitle = {Computer Vision -- ECCV 2016}, + title = {Deep Networks with Stochastic Depth}, + editor = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max}, + pages = {646--661}, + publisher = {Springer International Publishing}, + address = {Cham}, + file = {:Huang2016 - Deep Networks with Stochastic Depth.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2016}, +} + +@InProceedings{Hu2018, + author = {Hu, Jie and Shen, Li and Sun, Gang}, + booktitle = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + title = {Squeeze-and-Excitation Networks}, + doi = {10.1109/CVPR.2018.00745}, + pages = {7132-7141}, + file = {:Hu2018 - Squeeze and Excitation Networks.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2018}, +} 
+ +@Article{Rao2021, + author = {Rao, Yongming and Zhao, Wenliang and Zhu, Zheng and Lu, Jiwen and Zhou, Jie}, + title = {Global Filter Networks for Image Classification}, + volume = {34}, + file = {:Rao2021 - Global Filter Networks for Image Classification.pdf:PDF}, + journal = {Advances in Neural Information Processing Systems}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Misc{Wang2020, + author = {{Wang}, Sinong and {Li}, Belinda Z. and {Khabsa}, Madian and {Fang}, Han and {Ma}, Hao}, + title = {Linformer: Self-Attention with Linear Complexity}, + doi = {10.48550/arxiv.2006.04768}, + eprint = {2006.04768}, + archiveprefix = {arXiv}, + file = {:Wang2020 - Linformer_ Self Attention with Linear Complexity.pdf:PDF}, + month = jun, + primaryclass = {cs.LG}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@InProceedings{Choromanski2021, + author = {Krzysztof Marcin Choromanski and Valerii Likhosherstov and David Dohan and Xingyou Song and Andreea Gane and Tamas Sarlos and Peter Hawkins and Jared Quincy Davis and Afroz Mohiuddin and Lukasz Kaiser and David Benjamin Belanger and Lucy J Colwell and Adrian Weller}, + booktitle = {International Conference on Learning Representations}, + title = {Rethinking Attention with Performers}, + file = {:Choromanski2021 - Rethinking Attention with Performers.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Fournier2023, + author = {Fournier, Quentin and Caron, Ga\'{e}tan Marceau and Aloise, Daniel}, + title = {A Practical Survey on Faster and Lighter Transformers}, + doi = {10.1145/3586074}, + issn = {0360-0300}, + address = {New York, NY, USA}, + file = {:Fournier2023 - A Practical Survey on Faster and Lighter Transformers.pdf:PDF}, + journal = {ACM Comput. 
Surv.}, + month = {3}, + publisher = {Association for Computing Machinery}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Marcus2020, + author = {{Marcus}, Gary}, + title = {{The Next Decade in AI: Four Steps Towards Robust Artificial Intelligence}}, + doi = {10.48550/arxiv.2002.06177}, + eprint = {2002.06177}, + archiveprefix = {arXiv}, + file = {:Marcus2020 - The Next Decade in AI_ Four Steps Towards Robust Artificial Intelligence.pdf:PDF}, + keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, I.2, I.2.6}, + month = feb, + primaryclass = {cs.AI}, + readstatus = {read}, + year = {2020}, +} + +@InProceedings{Palacio2021, + author = {Palacio, Sebastian and Engler, Philipp and Hees, Jörn and Dengel, Andreas}, + booktitle = {2020 25th International Conference on Pattern Recognition (ICPR)}, + title = {Contextual Classification Using Self-Supervised Auxiliary Models for Deep Neural Networks}, + doi = {10.1109/ICPR48806.2021.9412175}, + pages = {8937-8944}, + file = {:Palacio2021 - Contextual Classification Using Self Supervised Auxiliary Models for Deep Neural Networks.pdf:PDF}, + readstatus = {skimmed}, + year = {2021}, +} + +@Article{Yang2021, + author = {Yi Yang and Yueting Zhuang and Yunhe Pan}, + date = {2021-12}, + journaltitle = {Frontiers of Information Technology {\&}amp$\mathsemicolon$ Electronic Engineering}, + title = {Multiple knowledge representation for big data artificial intelligence: framework, applications, and case studies}, + doi = {10.1631/fitee.2100463}, + number = {12}, + pages = {1551--1558}, + volume = {22}, + file = {:Yang2021 - Multiple Knowledge Representation for Big Data Artificial Intelligence_ Framework, Applications, and Case Studies.pdf:PDF}, + publisher = {Zhejiang University Press}, + readstatus = {skimmed}, +} + +@InProceedings{Zhang2018, + author = {Zhao Zhang and Fuzhen Zhuang and Meng Qu and Fen Lin and Qing He}, + booktitle = {Proceedings of 
the 2018 Conference on Empirical Methods in Natural Language Processing}, + date = {2018}, + title = {Knowledge Graph Embedding with Hierarchical Relation Structure}, + doi = {10.18653/v1/d18-1358}, + publisher = {Association for Computational Linguistics}, + file = {:Zhang2018 - Knowledge Graph Embedding with Hierarchical Relation Structure.pdf:PDF}, +} + +@Article{Ding2018, + author = {Liya Ding}, + date = {2018}, + journaltitle = {Procedia Computer Science}, + title = {Human Knowledge in Constructing {AI} Systems {\textemdash} Neural Logic Networks Approach towards an Explainable {AI}}, + doi = {10.1016/j.procs.2018.08.129}, + pages = {1561--1570}, + volume = {126}, + file = {:Ding2018 - Human Knowledge in Constructing AI Systems _ Neural Logic Networks Approach Towards an Explainable AI.pdf:PDF}, + publisher = {Elsevier {BV}}, +} + +@Article{Jinfeng2020, + author = {Gao Jinfeng and Sehrish Qummar and Zhang Junming and Yao Ruxian and Fiaz Gul Khan}, + date = {2020-12}, + journaltitle = {Computational Intelligence and Neuroscience}, + title = {Ensemble Framework of Deep {CNNs} for Diabetic Retinopathy Detection}, + doi = {10.1155/2020/8864698}, + editor = {Elpida Keravnou}, + pages = {1--11}, + volume = {2020}, + file = {:Jinfeng2020 - Ensemble Framework of Deep CNNs for Diabetic Retinopathy Detection.pdf:PDF}, + publisher = {Hindawi Limited}, +} + +@InProceedings{CasadoGarcia2020, + author = {Casado-Garc{\'\i}a, {\'A}ngela and Heras, J{\'o}nathan}, + booktitle = {ECAI 2020}, + title = {Ensemble methods for object detection}, + pages = {2688--2695}, + publisher = {IOS Press}, + file = {:CasadoGarcia2020 - Ensemble Methods for Object Detection.pdf:PDF}, + year = {2020}, +} + +@Misc{Pardo2019, + author = {Pardo, Alejandro and Alwassel, Humam and Heilbron, Fabian Caba and Thabet, Ali and Ghanem, Bernard}, + date = {2019}, + title = {RefineLoc: Iterative Refinement for Weakly-Supervised Action Localization}, + doi = {10.48550/ARXIV.1904.00227}, + copyright = 
{arXiv.org perpetual, non-exclusive license}, + file = {:Pardo2019 - RefineLoc_ Iterative Refinement for Weakly Supervised Action Localization.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, +} + +@InProceedings{Moon2019, + author = {Seungwhan Moon and Pararth Shah and Anuj Kumar and Rajen Subba}, + booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, + date = {2019}, + title = {{OpenDialKG}: Explainable Conversational Reasoning with Attention-based Walks over Knowledge Graphs}, + doi = {10.18653/v1/p19-1081}, + publisher = {Association for Computational Linguistics}, + file = {:Moon2019 - OpenDialKG_ Explainable Conversational Reasoning with Attention Based Walks Over Knowledge Graphs.pdf:PDF}, +} + +@InCollection{Azimi2019, + author = {Fatemeh Azimi and Federico Raue and Jörn Hees and Andreas Dengel}, + booktitle = {Artificial Neural Networks and Machine Learning {\textendash} {ICANN} 2019: Theoretical Neural Computation}, + date = {2019}, + title = {A Reinforcement Learning Approach for Sequential Spatial Transformer Networks}, + doi = {10.1007/978-3-030-30487-4_45}, + pages = {585--597}, + publisher = {Springer International Publishing}, + file = {:Azimi2019 - A Reinforcement Learning Approach for Sequential Spatial Transformer Networks.pdf:PDF}, +} + +@Misc{Azimi2021, + author = {Azimi, Fatemeh and Nies, Jean-Francois Jacques Nicolas and Palacio, Sebastian and Raue, Federico and Hees, Jörn and Dengel, Andreas}, + date = {2021}, + title = {Spatial Transformer Networks for Curriculum Learning}, + doi = {10.48550/ARXIV.2108.09696}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Azimi2021 - Spatial Transformer Networks for Curriculum Learning.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, +} + 
+@Article{Vinuesa2020, + author = {Ricardo Vinuesa and Hossein Azizpour and Iolanda Leite and Madeline Balaam and Virginia Dignum and Sami Domisch and Anna Felländer and Simone Daniela Langhans and Max Tegmark and Francesco Fuso Nerini}, + date = {2020-01}, + journaltitle = {Nature Communications}, + title = {The role of artificial intelligence in achieving the Sustainable Development Goals}, + doi = {10.1038/s41467-019-14108-y}, + number = {1}, + volume = {11}, + file = {:Vinuesa2020 - The Role of Artificial Intelligence in Achieving the Sustainable Development Goals.pdf:PDF}, + publisher = {Springer Science and Business Media {LLC}}, + readstatus = {skimmed}, +} + +@InCollection{Kamath2022, + author = {Amita Kamath and Christopher Clark and Tanmay Gupta and Eric Kolve and Derek Hoiem and Aniruddha Kembhavi}, + booktitle = {Lecture Notes in Computer Science}, + date = {2022}, + title = {Webly Supervised Concept Expansion for~General Purpose Vision Models}, + doi = {10.1007/978-3-031-20059-5_38}, + pages = {662--681}, + publisher = {Springer Nature Switzerland}, + file = {:Kamath2022 - Webly Supervised Concept Expansion For~General Purpose Vision Models.pdf:PDF}, +} + +@Article{Xu2023c, + author = {Xu, Peng and Zhu, Xiatian and Clifton, David A.}, + title = {Multimodal Learning With Transformers: A Survey}, + doi = {10.1109/TPAMI.2023.3275156}, + number = {10}, + pages = {12113-12132}, + volume = {45}, + file = {:Xu2022a - Multimodal Learning with Transformers_ a Survey.pdf:PDF}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + keywords = {Transformers;Task analysis;Surveys;Visualization;Taxonomy;Mathematical models;Data models;Multimodal learning;transformer;introductory;taxonomy;deep learning;machine learning}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@Misc{Sevim2022, + author = {Sevim, Nurullah and Özyedek, Ege Ozan and Şahinuç, Furkan and Koç, Aykut}, + date = {2022}, + title = 
{Fast-FNet: Accelerating Transformer Encoder Models via Efficient Fourier Layers}, + doi = {10.48550/ARXIV.2209.12816}, + eprint = {2209.12816}, + archiveprefix = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Sevim2022 - Fast FNet_ Accelerating Transformer Encoder Models Via Efficient Fourier Layers.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), General Literature (cs.GL), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + publisher = {arXiv}, + readstatus = {skimmed}, + year = {2022}, +} + +@InProceedings{Xu2020, + author = {Hongfei Xu and Qiuhui Liu and Josef van Genabith and Deyi Xiong and Jingyi Zhang}, + booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, + date = {2020}, + title = {Lipschitz Constrained Parameter Initialization for Deep Transformers}, + doi = {10.18653/v1/2020.acl-main.38}, + publisher = {Association for Computational Linguistics}, + file = {:Xu2020 - Lipschitz Constrained Parameter Initialization for Deep Transformers.pdf:PDF}, + readstatus = {skimmed}, +} + +@InCollection{Touvron2022a, + author = {Hugo Touvron and Matthieu Cord and Alaaeldin El-Nouby and Jakob Verbeek and Herv{\'{e}} J{\'{e}}gou}, + booktitle = {Lecture Notes in Computer Science}, + date = {2022}, + title = {Three Things Everyone Should Know About Vision Transformers}, + doi = {10.1007/978-3-031-20053-3_29}, + pages = {497--515}, + publisher = {Springer Nature Switzerland}, + file = {:Touvron2022a - Three Things Everyone Should Know about Vision Transformers.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, +} + +@Misc{Zhang2022, + author = {Zhang, Daniel and Maslej, Nestor and Brynjolfsson, Erik and Etchemendy, John and Lyons, Terah and Manyika, James and Ngo, Helen and Niebles, Juan Carlos and Sellitto, Michael and 
Sakhaee, Ellie and Shoham, Yoav and Clark, Jack and Perrault, Raymond}, + date = {2022}, + title = {The AI Index 2022 Annual Report}, + doi = {10.48550/ARXIV.2205.03468}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Zhang2022 - The AI Index 2022 Annual Report.pdf:PDF}, + groups = {Misc}, + keywords = {Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, +} + +@InProceedings{Jiang2021, + author = {Zihang Jiang and Qibin Hou and Li Yuan and Zhou Daquan and Yujun Shi and Xiaojie Jin and Anran Wang and Jiashi Feng}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {All Tokens Matter: Token Labeling for Training Better Vision Transformers}, + editor = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan}, + url = {https://openreview.net/forum?id=2vubO341F_E}, + file = {:Jiang2021 - All Tokens Matter_ Token Labeling for Training Better Vision Transformers.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Rahate2022, + author = {Anil Rahate and Rahee Walambe and Sheela Ramanna and Ketan Kotecha}, + date = {2022-05}, + journaltitle = {Information Fusion}, + title = {Multimodal Co-learning: Challenges, applications with datasets, recent advances and future directions}, + doi = {10.1016/j.inffus.2021.12.003}, + pages = {203--239}, + volume = {81}, + file = {:Rahate2022 - Multimodal Co Learning_ Challenges, Applications with Datasets, Recent Advances and Future Directions.pdf:PDF}, + publisher = {Elsevier {BV}}, +} + +@InProceedings{Liu2022, + author = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie}, + booktitle = {2022 {IEEE}/{CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})}, + date = {2022-06}, + title = {A {ConvNet} for the 2020s}, + doi = {10.1109/cvpr52688.2022.01167}, + publisher = {{IEEE}}, + file 
= {:Liu2022 - A ConvNet for the 2020s.pdf:PDF}, + keywords = {ConvNext}, +} + +@InProceedings{Rao2021a, + author = {Rao, Yongming and Zhao, Wenliang and Liu, Benlin and Lu, Jiwen and Zhou, Jie and Hsieh, Cho-Jui}, + booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, + title = {DynamicViT: Efficient Vision Transformers with Dynamic Token Sparsification}, + editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan}, + pages = {13937--13949}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper/2021/file/747d3443e319a22747fbb873e8b2f9f2-Paper.pdf}, + volume = {34}, + file = {:Rao2021a - DynamicViT_ Efficient Vision Transformers with Dynamic Token Sparsification.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Yun2019, + author = {Sangdoo Yun and Dongyoon Han and Sanghyuk Chun and Seong Joon Oh and Youngjoon Yoo and Junsuk Choe}, + booktitle = {2019 {IEEE}/{CVF} International Conference on Computer Vision ({ICCV})}, + date = {2019-10}, + title = {{CutMix}: Regularization Strategy to Train Strong Classifiers With Localizable Features}, + doi = {10.1109/iccv.2019.00612}, + publisher = {{IEEE}}, + file = {:Yun2019 - CutMix_ Regularization Strategy to Train Strong Classifiers with Localizable Features.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2019}, +} + +@InProceedings{Touvron2021a, + author = {Hugo Touvron and Matthieu Cord and Alexandre Sablayrolles and Gabriel Synnaeve and Herve Jegou}, + booktitle = {2021 {IEEE}/{CVF} International Conference on Computer Vision ({ICCV})}, + date = {2021-10}, + title = {Going deeper with Image Transformers}, + doi = {10.1109/iccv48922.2021.00010}, + publisher = {{IEEE}}, + file = {:Touvron2021a - Going Deeper with Image Transformers.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + 
+@InProceedings{Touvron2021b, + author = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and Jegou, Herve}, + booktitle = {Proceedings of the 38th International Conference on Machine Learning}, + title = {Training data-efficient image transformers \& distillation through attention}, + editor = {Meila, Marina and Zhang, Tong}, + pages = {10347--10357}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + url = {https://proceedings.mlr.press/v139/touvron21a.html}, + volume = {139}, + file = {:Touvron2021b - Training Data Efficient Image Transformers & Distillation through Attention.pdf:PDF}, + month = {7}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Deng2009, + author = {Jia Deng and Wei Dong and Richard Socher and Li-Jia Li and Kai Li and Li Fei-Fei}, + booktitle = {2009 {IEEE} Conference on Computer Vision and Pattern Recognition}, + date = {2009-06}, + title = {{ImageNet}: A large-scale hierarchical image database}, + doi = {10.1109/cvpr.2009.5206848}, + publisher = {{IEEE}}, + file = {:Deng2009 - ImageNet_ a Large Scale Hierarchical Image Database.pdf:PDF}, + year = {2009}, +} + +@InProceedings{Ridnik2021, + author = {Tal Ridnik and Emanuel Ben-Baruch and Asaf Noy and Lihi Zelnik-Manor}, + booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)}, + title = {ImageNet-21K Pretraining for the Masses}, + eprint = {2104.10972}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=Zkj_VcZ6ol}, + archiveprefix = {arXiv}, + file = {:Ridnik2021 - ImageNet 21K Pretraining for the Masses.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Zhang2018a, + author = {Hongyi Zhang and Moustapha Cisse and Yann N. 
Dauphin and David Lopez-Paz}, + booktitle = {International Conference on Learning Representations}, + title = {mixup: Beyond Empirical Risk Minimization}, + url = {https://openreview.net/forum?id=r1Ddp1-Rb}, + file = {:Zhang2018a - Mixup_ beyond Empirical Risk Minimization.pdf:PDF}, + year = {2018}, +} + +@Article{Wang2018, + author = {Tongzhou Wang and Jun-Yan Zhu and Antonio Torralba and Alexei A. Efros}, + date = {2018-11-27}, + title = {Dataset Distillation}, + doi = {10.48550/arxiv.1811.10959}, + eprint = {1811.10959}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Wang2018 - Dataset Distillation.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + journal = {arXiv preprint arXiv:1811.10959}, + keywords = {cs.LG, stat.ML}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2018}, +} + +@Article{Bohdal2020, + author = {Ondrej Bohdal and Yongxin Yang and Timothy Hospedales}, + date = {2020-06-15}, + title = {Flexible Dataset Distillation: Learn Labels Instead of Images}, + doi = {10.48550/arxiv.2006.08572}, + eprint = {2006.08572}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + comment = {Presented at NeurIPS 2020}, + file = {:Bohdal2020 - Flexible Dataset Distillation_ Learn Labels Instead of Images.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + keywords = {cs.LG, stat.ML}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@InProceedings{Nguyen2021a, + author = {Timothy Nguyen and Roman Novak and Lechao Xiao and Jaehoon Lee}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Dataset Distillation with Infinitely Wide Convolutional Networks}, + editor = {A. Beygelzimer and Y. Dauphin and P. Liang and J. 
Wortman Vaughan}, + url = {https://openreview.net/forum?id=hXWPpJedrVP}, + file = {:Nguyen2021a - Dataset Distillation with Infinitely Wide Convolutional Networks.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2021}, +} + +@Article{Zhao2021a, + author = {Bo Zhao and Hakan Bilen}, + date = {2021-02-16}, + journaltitle = {International Conference on Machine Learning 2021}, + title = {Dataset Condensation with Differentiable Siamese Augmentation}, + doi = {10.48550/arxiv.2102.08259}, + eprint = {2102.08259}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Zhao2021a - Dataset Condensation with Differentiable Siamese Augmentation.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + keywords = {cs.LG, cs.CV}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@InProceedings{Sucholutsky2021, + author = {Ilia Sucholutsky and Matthias Schonlau}, + booktitle = {2021 International Joint Conference on Neural Networks (IJCNN)}, + date = {2019-10-06}, + title = {Soft-Label Dataset Distillation and Text Dataset Distillation}, + doi = {10.1109/IJCNN52387.2021.9533769}, + eprint = {1910.02551}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + organization = {IEEE}, + pages = {1--8}, + file = {:Sucholutsky2021 - Soft Label Dataset Distillation and Text Dataset Distillation.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + keywords = {cs.LG, cs.AI, stat.ML}, + year = {2021}, +} + +@Article{Asano2021, + author = {Yuki M. 
Asano and Aaqib Saeed}, + date = {2021-12-01}, + title = {Extrapolating from a Single Image to a Thousand Classes using Distillation}, + doi = {10.48550/arxiv.2112.00725}, + eprint = {2112.00725}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Asano2021 - Extrapolating from a Single Image to a Thousand Classes Using Distillation.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + keywords = {cs.CV}, + readstatus = {skimmed}, +} + +@Article{Cazenavette2022, + author = {George Cazenavette and Tongzhou Wang and Antonio Torralba and Alexei A. Efros and Jun-Yan Zhu}, + date = {2022-03-22}, + title = {Dataset Distillation by Matching Training Trajectories}, + doi = {10.48550/arxiv.2203.11932}, + eprint = {2203.11932}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + file = {:Cazenavette2022 - Dataset Distillation by Matching Training Trajectories.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + keywords = {cs.CV, cs.AI, cs.LG}, + readstatus = {skimmed}, + year = {2022}, +} + +@InProceedings{Cazenavette2022a, + author = {G. Cazenavette and T. Wang and A. Torralba and A. A. Efros and J.
Zhu}, + booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)}, + title = {Wearable ImageNet: Synthesizing Tileable Textures via Dataset Distillation}, + doi = {10.1109/CVPRW56347.2022.00252}, + pages = {2277-2281}, + publisher = {IEEE Computer Society}, + address = {Los Alamitos, CA, USA}, + file = {:Cazenavette2022a - Wearable ImageNet_ Synthesizing Tileable Textures Via Dataset Distillation.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + month = {jun}, + readstatus = {skimmed}, + year = {2022}, +} + +@InProceedings{Wang2022, + author = {Wang, Kai and Zhao, Bo and Peng, Xiangyu and Zhu, Zheng and Yang, Shuo and Wang, Shuo and Huang, Guan and Bilen, Hakan and Wang, Xinchao and You, Yang}, + booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {CAFE: Learning to Condense Dataset by Aligning Features}, + doi = {10.1109/CVPR52688.2022.01188}, + pages = {12186-12195}, + file = {:Wang2022 - CAFE_ Learning to Condense Dataset by Aligning Features.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + year = {2022}, +} + +@Article{Lee2022, + author = {Hae Beom Lee and Dong Bok Lee and Sung Ju Hwang}, + date = {2022-08-21}, + title = {Dataset Condensation with Latent Space Knowledge Factorization and Sharing}, + doi = {10.48550/arxiv.2208.10494}, + eprint = {2208.10494}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Lee2022 - Dataset Condensation with Latent Space Knowledge Factorization and Sharing.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + keywords = {cs.LG, cs.AI}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@InProceedings{Zhou2022, + author = {Yongchao Zhou and Ehsan Nezhadarya and Jimmy Ba}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Dataset Distillation using Neural Feature Regression}, + editor = {Alice H. 
Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, + url = {https://openreview.net/forum?id=2clwrA2tfik}, + file = {:Zhou2022 - Dataset Distillation Using Neural Feature Regression.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + year = {2022}, +} + +@InProceedings{Liu2022a, + author = {Songhua Liu and Kai Wang and Xingyi Yang and Jingwen Ye and Xinchao Wang}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Dataset Distillation via Factorization}, + editor = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, + url = {https://openreview.net/forum?id=luGXvawYWJ}, + file = {:Liu2022a - Dataset Distillation Via Factorization.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + year = {2022}, +} + +@InProceedings{Kim2022, + author = {Kim, Jang-Hyun and Kim, Jinuk and Oh, Seong Joon and Yun, Sangdoo and Song, Hwanjun and Jeong, Joonhyun and Ha, Jung-Woo and Song, Hyun Oh}, + booktitle = {Proceedings of the 39th International Conference on Machine Learning}, + title = {Dataset Condensation via Efficient Synthetic-Data Parameterization}, + editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan}, + pages = {11102--11118}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + url = {https://proceedings.mlr.press/v162/kim22c.html}, + volume = {162}, + file = {:Kim2022 - Dataset Condensation Via Efficient Synthetic Data Parameterization.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + month = {17--23 Jul}, + pdf = {https://proceedings.mlr.press/v162/kim22c/kim22c.pdf}, + year = {2022}, +} + +@InProceedings{Lee2022a, + author = {Saehyung Lee and Sanghyuk Chun and Sangwon Jung and Sangdoo Yun and Sungroh Yoon}, + booktitle = {Proceedings of the 39th International Conference on Machine Learning}, + date = {2022-02-07}, + title = {Dataset Condensation with 
Contrastive Signals}, + doi = {10.48550/arxiv.2202.02916}, + eprint = {2202.02916}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:http\://arxiv.org/pdf/2202.02916v3:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + keywords = {cs.CV, cs.LG}, +} + +@InProceedings{Thilak2022, + author = {Vimal Thilak and Etai Littwin and Shuangfei Zhai and Omid Saremi and Roni Paiss and Joshua M. Susskind}, + booktitle = {Has it Trained Yet? NeurIPS 2022 Workshop}, + date = {2022-06-10}, + title = {The Slingshot Mechanism: An Empirical Study of Adaptive Optimizers and the Grokking Phenomenon}, + doi = {10.48550/arxiv.2206.04817}, + eprint = {2206.04817}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=lY1e0PNkSJ}, + file = {:Thilak2022 - The Slingshot Mechanism_ an Empirical Study of Adaptive Optimizers and the Grokking Phenomenon.pdf:PDF}, + keywords = {cs.LG, math.OC}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Katharopoulos2018, + author = {Katharopoulos, Angelos and Fleuret, Fran{\c{c}}ois}, + booktitle = {International conference on machine learning}, + title = {Not all samples are created equal: Deep learning with importance sampling}, + organization = {PMLR}, + pages = {2525--2534}, + file = {:Katharopoulos2018 - Not All Samples Are Created Equal_ Deep Learning with Importance Sampling.pdf:PDF}, + groups = {Dataset Distillation Survey, Importance Sampling}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2018}, +} + +@Article{Vodrahalli2018, + author = {Kailas Vodrahalli and Ke Li and Jitendra Malik}, + date = {2018-11-30}, + title = {Are All Training Examples Created Equal?
An Empirical Study}, + doi = {10.48550/arxiv.1811.12569}, + eprint = {1811.12569}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Vodrahalli2018 - Are All Training Examples Created Equal_ an Empirical Study.pdf:PDF}, + groups = {Importance Sampling, Dataset Distillation Survey}, + keywords = {cs.LG, cs.CV, stat.ML}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@Article{Lapedriza2013, + author = {Agata Lapedriza and Hamed Pirsiavash and Zoya Bylinskii and Antonio Torralba}, + date = {2013-11-25}, + title = {Are all training examples equally valuable?}, + doi = {10.48550/arxiv.1311.6510}, + eprint = {1311.6510}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Lapedriza2013 - Are All Training Examples Equally Valuable_.pdf:PDF}, + groups = {Importance Sampling, Dataset Distillation Survey}, + keywords = {cs.CV, cs.LG, stat.ML}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@InProceedings{Coleman2020, + author = {Cody Coleman and Christopher Yeh and Stephen Mussmann and Baharan Mirzasoleiman and Peter Bailis and Percy Liang and Jure Leskovec and Matei Zaharia}, + booktitle = {International Conference on Learning Representations}, + title = {Selection via Proxy: Efficient Data Selection for Deep Learning}, + url = {https://openreview.net/forum?id=HJg2b0VYDr}, + file = {:Coleman2020 - Selection Via Proxy_ Efficient Data Selection for Deep Learning.pdf:PDF}, + groups = {Dataset Distillation Survey}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2020}, +} + +@Article{Shleifer2019, + author = {Sam Shleifer and Eric Prokop}, + date = {2019-06-12}, + title = {Proxy Datasets for Training Convolutional Neural Networks}, + doi = {10.48550/arxiv.1906.04887}, + eprint = {1906.04887}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Shleifer2019 - Proxy Datasets for Training Convolutional Neural Networks.pdf:PDF}, + groups = {Importance Sampling, Dataset Distillation 
Survey}, + keywords = {cs.LG, cs.CV, stat.ML}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@Article{Zhangs2019, + author = {Cheng Zhang and Cengiz Öztireli and Stephan Mandt and Giampiero Salvi}, + date = {2019-07}, + journaltitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence}, + title = {Active Mini-Batch Sampling Using Repulsive Point Processes}, + doi = {10.1609/aaai.v33i01.33015741}, + number = {01}, + pages = {5741--5748}, + volume = {33}, + file = {:Zhangs2019 - Active Mini Batch Sampling Using Repulsive Point Processes.pdf:PDF}, + groups = {Dataset Distillation Survey, Importance Sampling}, + publisher = {Association for the Advancement of Artificial Intelligence ({AAAI})}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@Article{Loshchilov2015, + author = {Ilya Loshchilov and Frank Hutter}, + date = {2015-11-19}, + journaltitle = {Workshop Track - ICLR 2016}, + title = {Online Batch Selection for Faster Training of Neural Networks}, + doi = {10.48550/arxiv.1511.06343}, + eprint = {1511.06343}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Loshchilov2015 - Online Batch Selection for Faster Training of Neural Networks.pdf:PDF}, + groups = {Dataset Distillation Survey, Importance Sampling}, + keywords = {cs.LG, cs.NE, math.OC}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@InProceedings{Fan2018, + author = {Yang Fan and Fei Tian and Tao Qin and Xiang-Yang Li and Tie-Yan Liu}, + booktitle = {International Conference on Learning Representations}, + title = {Learning to Teach}, + url = {https://openreview.net/forum?id=HJewuJWCZ}, + file = {:Fan2018 - Learning to Teach.pdf:PDF}, + groups = {Dataset Distillation Survey, Importance Sampling}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2018}, +} + +@InProceedings{Shu2019, + author = {Jun Shu and Qi Xie and Lixuan Yi and Qian Zhao and Sanping Zhou and Zongben Xu and Deyu Meng}, +
booktitle = {Advances in Neural Information Processing Systems}, + date = {2019-02-20}, + title = {Meta-Weight-Net: Learning an Explicit Mapping For Sample Weighting}, + doi = {10.48550/arxiv.1902.07379}, + editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett}, + eprint = {1902.07379}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper/2019/file/e58cc5ca94270acaceed13bc82dfedf7-Paper.pdf}, + volume = {32}, + file = {:Shu2019 - Meta Weight Net_ Learning an Explicit Mapping for Sample Weighting.pdf:PDF}, + groups = {Dataset Distillation Survey, Importance Sampling}, + keywords = {cs.LG, stat.ML}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2019}, +} + +@InProceedings{Ren2018, + author = {Ren, Mengye and Zeng, Wenyuan and Yang, Bin and Urtasun, Raquel}, + booktitle = {Proceedings of the 35th International Conference on Machine Learning}, + title = {Learning to Reweight Examples for Robust Deep Learning}, + editor = {Dy, Jennifer and Krause, Andreas}, + pages = {4334--4343}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + url = {https://proceedings.mlr.press/v80/ren18a.html}, + volume = {80}, + file = {:Ren2018 - Learning to Reweight Examples for Robust Deep Learning.pdf:PDF}, + groups = {Dataset Distillation Survey, Importance Sampling}, + month = {10--15 Jul}, + pdf = {http://proceedings.mlr.press/v80/ren18a/ren18a.pdf}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2018}, +} + +@Article{Elman1993, + author = {Jeffrey L. 
Elman}, + date = {1993-07}, + journaltitle = {Cognition}, + title = {Learning and development in neural networks: The importance of starting small}, + doi = {10.1016/0010-0277(93)90058-4}, + number = {1}, + pages = {71--99}, + volume = {48}, + file = {:Elman1993 - Learning and Development in Neural Networks_ the Importance of Starting Small.pdf:PDF}, + groups = {Importance Sampling, Dataset Distillation Survey}, + publisher = {Elsevier {BV}}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@InProceedings{Bengio2009, + author = {Bengio, Yoshua and Louradour, J\'{e}r\^{o}me and Collobert, Ronan and Weston, Jason}, + booktitle = {Proceedings of the 26th Annual International Conference on Machine Learning}, + title = {Curriculum Learning}, + doi = {10.1145/1553374.1553380}, + isbn = {9781605585161}, + location = {Montreal, Quebec, Canada}, + pages = {41–48}, + publisher = {Association for Computing Machinery}, + series = {ICML '09}, + address = {New York, NY, USA}, + file = {:Bengio2009 - Curriculum Learning.pdf:PDF}, + groups = {Importance Sampling, Dataset Distillation Survey}, + numpages = {8}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2009}, +} + +@InProceedings{Jiang2017, + author = {Lu Jiang and Zhengyuan Zhou and Thomas Leung and Li-Jia Li and Li Fei-Fei}, + date = {2017-12-14}, + title = {MentorNet: Learning Data-Driven Curriculum for Very Deep Neural Networks on Corrupted Labels}, + doi = {10.48550/arxiv.1712.05055}, + eprint = {1712.05055}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + location = {Stockholm, Sweden}, + file = {:Jiang2017 - MentorNet_ Learning Data Driven Curriculum for Very Deep Neural Networks on Corrupted Labels.pdf:PDF}, + groups = {Dataset Distillation Survey, Importance Sampling}, + journaltitle = {Proceedings of the 35 th International Conference on MachineLearning}, + keywords = {cs.CV}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@InProceedings{Lee2021, 
+ author = {Sangho Lee and Jiwan Chung and Youngjae Yu and Gunhee Kim and Thomas Breuel and Gal Chechik and Yale Song}, + booktitle = {ICCV}, + title = {ACAV100M: Automatic Curation of Large-Scale Datasets for Audio-Visual Video Representation Learning}, + doi = {10.48550/arxiv.2101.10803}, + eprint = {2101.10803}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Lee2021 - ACAV100M_ Automatic Curation of Large Scale Datasets for Audio Visual Video Representation Learning.pdf:PDF}, + groups = {Dataset Distillation Survey, Pruning}, + keywords = {cs.CV}, + readstatus = {skimmed}, + year = {2021}, +} + +@InProceedings{Li2019, + author = {Li, Junnan and Wong, Yongkang and Zhao, Qi and Kankanhalli, Mohan S}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + title = {Learning to learn from noisy labeled data}, + pages = {5051--5059}, + file = {:Li2019 - Learning to Learn from Noisy Labeled Data.pdf:PDF}, + groups = {Dataset Distillation Survey, Noisy Labels}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2019}, +} + +@Article{Wan2022, + author = {Zhijing Wan and Zhixiang Wang and CheukTing Chung and Zheng Wang}, + date = {2022-10-21}, + title = {A Survey of Data Optimization for Problems in Computer Vision Datasets}, + doi = {10.48550/arxiv.2210.11717}, + eprint = {2210.11717}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Wan2022 - A Survey of Data Optimization for Problems in Computer Vision Datasets.pdf:PDF}, + groups = {Dataset Distillation Survey, Surveys}, + keywords = {cs.CV, A.1}, +} + +@InProceedings{Wortsman2022, + author = {Wortsman, Mitchell and Ilharco, Gabriel and Gadre, Samir Ya and Roelofs, Rebecca and Gontijo-Lopes, Raphael and Morcos, Ari S and Namkoong, Hongseok and Farhadi, Ali and Carmon, Yair and Kornblith, Simon and Schmidt, Ludwig}, + booktitle = {Proceedings of the 39th International Conference on Machine Learning}, + title = {Model soups: 
averaging weights of multiple fine-tuned models improves accuracy without increasing inference time}, + editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan}, + pages = {23965--23998}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + url = {https://proceedings.mlr.press/v162/wortsman22a.html}, + volume = {162}, + file = {:Wortsman2022 - Model Soups_ Averaging Weights of Multiple Fine Tuned Models Improves Accuracy without Increasing Inference Time.pdf:PDF}, + month = {17--23 Jul}, + pdf = {https://proceedings.mlr.press/v162/wortsman22a/wortsman22a.pdf}, + year = {2022}, +} + +@Article{Dwivedi2022, + author = {Dwivedi, Vijay Prakash and Joshi, Chaitanya K. and Luu, Anh Tuan and Laurent, Thomas and Bengio, Yoshua and Bresson, Xavier}, + date = {2022-12-28}, + journaltitle = {Journal of Machine Learning Research (JMLR), 2022}, + title = {Benchmarking Graph Neural Networks}, + doi = {10.48550/ARXIV.2003.00982}, + eprint = {2003.00982}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Dwivedi2022 - Benchmarking Graph Neural Networks.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@InProceedings{Nguyen2021, + author = {Timothy Nguyen and Zhourong Chen and Jaehoon Lee}, + booktitle = {International Conference on Learning Representations}, + title = {Dataset Meta-Learning from Kernel Ridge-Regression}, + url = {https://openreview.net/forum?id=l-PrrQrK0QR}, + file = {:Nguyen2021 - Dataset Meta Learning from Kernel Ridge Regression.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2021}, +} + +@Misc{Godbole2023, + author = {Varun Godbole and George E. 
Dahl and Justin Gilmer and Christopher J. Shallue and Zachary Nado}, + title = {Deep Learning Tuning Playbook}, + note = {Version 1.0}, + url = {https://github.com/google-research/tuning_playbook}, + file = {:Godbole2023 - Deep Learning Tuning Playbook.pdf:PDF}, + groups = {Misc}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Liu2022b, + author = {Liu, Rosanne and Garrette, Dan and Saharia, Chitwan and Chan, William and Roberts, Adam and Narang, Sharan and Blok, Irina and Mical, RJ and Norouzi, Mohammad and Constant, Noah}, + date = {2022-12-20}, + title = {Character-Aware Models Improve Visual Text Rendering}, + doi = {10.48550/ARXIV.2212.10562}, + eprint = {2212.10562}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Liu2022b - Character Aware Models Improve Visual Text Rendering.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + readstatus = {skimmed}, + year = {2022}, +} + +@Article{Peebles2022, + author = {Peebles, William and Xie, Saining}, + date = {2022-12-19}, + title = {Scalable Diffusion Models with Transformers}, + doi = {10.48550/ARXIV.2212.09748}, + eprint = {2212.09748}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Peebles2022 - Scalable Diffusion Models with Transformers.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Strubell2019, + author = {Emma Strubell and Ananya Ganesh and Andrew McCallum}, + booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, + date = {2019}, + title = 
{Energy and Policy Considerations for Deep Learning in {NLP}}, + doi = {10.18653/v1/p19-1355}, + publisher = {Association for Computational Linguistics}, + file = {:Strubell2019 - Energy and Policy Considerations for Deep Learning in NLP.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, +} + +@InProceedings{Dao2022, + author = {Tri Dao and Daniel Y Fu and Stefano Ermon and Atri Rudra and Christopher Re}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {FlashAttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness}, + editor = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, + url = {https://openreview.net/forum?id=H4DqfPSibmx}, + file = {:Dao2022 - FlashAttention_ Fast and Memory Efficient Exact Attention with IO Awareness.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2022}, +} + +@InProceedings{Schuster2022, + author = {Tal Schuster and Adam Fisch and Jai Gupta and Mostafa Dehghani and Dara Bahri and Vinh Q. Tran and Yi Tay and Donald Metzler}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Confident Adaptive Language Modeling}, + editor = {Alice H. 
Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, + url = {https://openreview.net/forum?id=uLYc4L3C81A}, + file = {:Schuster2022 - Confident Adaptive Language Modeling.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Schuurmans2023, + author = {Schuurmans, Dale}, + date = {2023-01-10}, + title = {Memory Augmented Large Language Models are Computationally Universal}, + doi = {10.48550/ARXIV.2301.04589}, + eprint = {2301.04589}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Schuurmans2023 - Memory Augmented Large Language Models Are Computationally Universal.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Formal Languages and Automata Theory (cs.FL), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Rahman2023, + author = {Rahman, Md Salman and Lee, Wonkwon}, + date = {2023-01-25}, + title = {Out of Distribution Performance of State of Art Vision Model}, + doi = {10.48550/ARXIV.2301.10750}, + eprint = {2301.10750}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Rahman2023 - Out of Distribution Performance of State of Art Vision Model.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@Article{Cohen2023, + author = {Cohen, Roi and Geva, Mor and Berant, Jonathan and Globerson, Amir}, + date = {2023-01-30}, + title = {Crawling the Internal Knowledge-Base of Language Models}, + doi = {10.48550/ARXIV.2301.12810}, + eprint = {2301.12810}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Cohen2023 - 
Crawling the Internal Knowledge Base of Language Models.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Liu2023, + author = {Liu, Bin and Wang, Bang}, + date = {2023-01-27}, + title = {Bayesian Self-Supervised Contrastive Learning}, + doi = {10.48550/ARXIV.2301.11673}, + eprint = {2301.11673}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Liu2023 - Bayesian Self Supervised Contrastive Learning.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Zhao2021b, + author = {Zhao, Bo and Bilen, Hakan}, + date = {2021-10-08}, + journaltitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision 2023 (WACV)}, + title = {Dataset Condensation with Distribution Matching}, + doi = {10.48550/ARXIV.2110.04181}, + eprint = {2110.04181}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Zhao2021b - Dataset Condensation with Distribution Matching.pdf:PDF}, + groups = {Dataset Distillation Survey, Condensed Dataset}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2021}, +} + +@InProceedings{Jiang2022, + author = {Jiang, Chaoya and Xu, Haiyang and Li, Chenliang and Yan, Ming and Ye, Wei and Zhang, Shikun and Bi, Bin and Huang, Songfang}, + booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}, + title = {{TRIPS}: Efficient Vision-and-Language Pre-training with Text-Relevant Image Patch Selection}, + pages = {4084--4096}, + publisher = {Association for Computational Linguistics}, + url = 
{https://aclanthology.org/2022.emnlp-main.273}, + address = {Abu Dhabi, United Arab Emirates}, + file = {:Jiang2022 - TRIPS_ Efficient Vision and Language Pre Training with Text Relevant Image Patch Selection.pdf:PDF}, + month = dec, + year = {2022}, +} + +@InProceedings{Zhuang2023, + author = {Zhuang, Bohan and Liu, Jing and Pan, Zizheng and He, Haoyu and Weng, Yuetian and Shen, Chunhua}, + booktitle = {Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, {IJCAI-23}}, + title = {A Survey on Efficient Training of Transformers}, + doi = {10.24963/ijcai.2023/764}, + editor = {Edith Elkind}, + eprint = {2302.01107}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + note = {Survey Track}, + pages = {6823--6831}, + publisher = {International Joint Conferences on Artificial Intelligence Organization}, + url = {https://doi.org/10.24963/ijcai.2023/764}, + archiveprefix = {arXiv}, + file = {:Zhuang2023 - A Survey on Efficient Training of Transformers.pdf:PDF}, + month = {8}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Liu2023a, + author = {Liu, Hao and Yan, Wilson and Abbeel, Pieter}, + date = {2023-02-02}, + title = {Language Quantized AutoEncoders: Towards Unsupervised Text-Image Alignment}, + doi = {10.48550/ARXIV.2302.00902}, + eprint = {2302.00902}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Liu2023a - Language Quantized AutoEncoders_ Towards Unsupervised Text Image Alignment.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Jiao2023, + author = {Jiao, Jiayu and Tang, Yu-Ming and Lin, Kun-Yu and Gao, Yipeng and Ma, Jinhua and Wang, Yaowei and Zheng, Wei-Shi}, + title = {DilateFormer: Multi-Scale Dilated Transformer for Visual 
Recognition}, + file = {:Jiao2023 - DilateFormer_ Multi Scale Dilated Transformer for Visual Recognition.pdf:PDF}, + journal = {{IEEE} Transactions on Multimedia}, + year = {2023}, +} + +@Article{Han2022, + author = {Han, Xing and Ren, Tongzheng and Nguyen, Tan Minh and Nguyen, Khai and Ghosh, Joydeep and Ho, Nhat}, + date = {2022-10-11}, + title = {Robustify Transformers with Robust Kernel Density Estimation}, + doi = {10.48550/ARXIV.2210.05794}, + eprint = {2210.05794}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Han2022 - Robustify Transformers with Robust Kernel Density Estimation.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@InProceedings{Strubell2019a, + author = {Strubell, Emma and Ganesh, Ananya and McCallum, Andrew}, + booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, + date = {2019-06-05}, + title = {Energy and Policy Considerations for Deep Learning in NLP}, + doi = {10.18653/v1/P19-1355}, + pages = {3645--3650}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/P19-1355}, + address = {Florence, Italy}, + file = {:Strubell2019a - Energy and Policy Considerations for Deep Learning in NLP.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + month = jul, + readstatus = {read}, + year = {2019}, +} + +@InProceedings{Du2022, + author = {Jiawei Du and Zhou Daquan and Jiashi Feng and Vincent Tan and Joey Tianyi Zhou}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Sharpness-Aware Training for Free}, + editor = {Alice H. 
Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, + url = {https://openreview.net/forum?id=xK6wRfL2mv7}, + file = {:Du2022 - Sharpness Aware Training for Free.pdf:PDF}, + priority = {prio2}, + year = {2022}, +} + +@Article{Li2023, + author = {Li, Wenzhe and Luo, Hao and Lin, Zichuan and Zhang, Chongjie and Lu, Zongqing and Ye, Deheng}, + date = {2023-01-08}, + title = {A Survey on Transformers in Reinforcement Learning}, + doi = {10.48550/ARXIV.2301.03044}, + eprint = {2301.03044}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Li2023 - A Survey on Transformers in Reinforcement Learning.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Khan2022, + author = {Salman Khan and Muzammal Naseer and Munawar Hayat and Syed Waqas Zamir and Fahad Shahbaz Khan and Mubarak Shah}, + date = {2022-01}, + title = {Transformers in Vision: A Survey}, + doi = {10.1145/3505244}, + number = {10s}, + pages = {1--41}, + volume = {54}, + file = {:Khan2022 - Transformers in Vision_ a Survey.pdf:PDF}, + journal = {{ACM} Computing Surveys}, + publisher = {Association for Computing Machinery ({ACM})}, + year = {2022}, +} + +@InProceedings{ElNouby2021, + author = {El-Nouby, Alaaeldin and Touvron, Hugo and Caron, Mathilde and Bojanowski, Piotr and Douze, Matthijs and Joulin, Armand and Laptev, Ivan and Neverova, Natalia and Synnaeve, Gabriel and Verbeek, Jakob and Jegou, Hervé}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {XCiT: Cross-Covariance Image Transformers}, + doi = {10.48550/arxiv.2106.09681}, + editor = {A. Beygelzimer and Y. Dauphin and P. Liang and J. 
Wortman Vaughan}, + eprint = {2106.09681}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:ElNouby2021 - XCiT_ Cross Covariance Image Transformers.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Liu2022c, + author = {Liu, Ze and Hu, Han and Lin, Yutong and Yao, Zhuliang and Xie, Zhenda and Wei, Yixuan and Ning, Jia and Cao, Yue and Zhang, Zheng and Dong, Li and Wei, Furu and Guo, Baining}, + booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {Swin Transformer V2: Scaling Up Capacity and Resolution}, + doi = {10.48550/ARXIV.2111.09883}, + eprint = {2111.09883}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {11999-12009}, + publisher = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Liu2022c - Swin Transformer V2_ Scaling up Capacity and Resolution.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Pan2023, + author = {Pan, Zizheng and Cai, Jianfei and Zhuang, Bohan}, + date = {2023-02-13}, + title = {Stitchable Neural Networks}, + doi = {10.48550/ARXIV.2302.06586}, + eprint = {2302.06586}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Pan2023 - Stitchable Neural Networks.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Dehghani2023, + author = {Dehghani, Mostafa and Djolonga, Josip and Mustafa, Basil and 
Padlewski, Piotr and Heek, Jonathan and Gilmer, Justin and Steiner, Andreas and Caron, Mathilde and Geirhos, Robert and Alabdulmohsin, Ibrahim and Jenatton, Rodolphe and Beyer, Lucas and Tschannen, Michael and Arnab, Anurag and Wang, Xiao and Riquelme, Carlos and Minderer, Matthias and Puigcerver, Joan and Evci, Utku and Kumar, Manoj and van Steenkiste, Sjoerd and Elsayed, Gamaleldin F. and Mahendran, Aravindh and Yu, Fisher and Oliver, Avital and Huot, Fantine and Bastings, Jasmijn and Collier, Mark Patrick and Gritsenko, Alexey and Birodkar, Vighnesh and Vasconcelos, Cristina and Tay, Yi and Mensink, Thomas and Kolesnikov, Alexander and Pavetić, Filip and Tran, Dustin and Kipf, Thomas and Lučić, Mario and Zhai, Xiaohua and Keysers, Daniel and Harmsen, Jeremiah and Houlsby, Neil}, + date = {2023-02-10}, + title = {Scaling Vision Transformers to 22 Billion Parameters}, + doi = {10.48550/ARXIV.2302.05442}, + eprint = {2302.05442}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Dehghani2023 - Scaling Vision Transformers to 22 Billion Parameters.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@Article{Miyazawa2023, + author = {Kazuki Miyazawa and Takayuki Nagai}, + date = {2023-02}, + title = {Survey on Multimodal Transformers for Robots}, + doi = {10.36227/techrxiv.21993317.v1}, + file = {:Miyazawa2023 - Survey on Multimodal Transformers for Robots.pdf:PDF}, + publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, +} + +@InProceedings{Peng2021, + author = {Peng, Zhiliang and Huang, Wei and Gu, Shanzhi and Xie, Lingxi and Wang, Yaowei and Jiao, Jianbin and Ye, 
Qixiang}, + booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)}, + date = {2021-05-09}, + title = {Conformer: Local Features Coupling Global Representations for Visual Recognition}, + doi = {10.1109/ICCV48922.2021.00042}, + eprint = {2105.03889}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {357-366}, + publisher = {arXiv}, + address = {Los Alamitos, CA, USA}, + file = {:Peng2021 - Conformer_ Local Features Coupling Global Representations for Visual Recognition.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + month = {oct}, + year = {2021}, +} + +@Misc{Canziani2016, + author = {Canziani, Alfredo and Paszke, Adam and Culurciello, Eugenio}, + date = {2016-05-24}, + title = {An Analysis of Deep Neural Network Models for Practical Applications}, + doi = {10.48550/ARXIV.1605.07678}, + eprint = {1605.07678}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + archiveprefix = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Canziani2016 - An Analysis of Deep Neural Network Models for Practical Applications.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2016}, +} + +@InProceedings{Liu2021, + author = {Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, + doi = {10.1109/ICCV48922.2021.00986}, + eprint = {2103.14030}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {9992-10002}, + publisher = {IEEE Computer Society}, + address = {Los Alamitos, CA, USA}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Liu2021 - Swin 
Transformer_ Hierarchical Vision Transformer Using Shifted Windows.pdf:PDF}, + month = {10}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Su2023, + author = {Su, Tong and Song, Chengqun and Cheng, Jun}, + booktitle = {Advances in Guidance, Navigation and Control}, + title = {Vision Transformer with Information Bottleneck for Fine-Grained Visual Classification}, + doi = {10.1007/978-981-19-6613-2_391}, + editor = {Yan, Liang and Duan, Haibin and Deng, Yimin}, + isbn = {978-981-19-6613-2}, + pages = {4010--4019}, + publisher = {Springer Nature Singapore}, + address = {Singapore}, + year = {2023}, +} + +@Article{Shinoda2023, + author = {Risa Shinoda and Hirokatsu Kataoka and Kensho Hara and Ryozo Noguchi}, + date = {2023-08}, + journaltitle = {Smart Agricultural Technology}, + title = {Transformer-based ripeness segmentation for tomatoes}, + doi = {10.1016/j.atech.2023.100196}, + issn = {2772-3755}, + pages = {100196}, + url = {https://www.sciencedirect.com/science/article/pii/S2772375523000266}, + volume = {4}, + publisher = {Elsevier {BV}}, + year = {2023}, +} + +@Article{Yao2023, + author = {Dazhi Yao and Yunxue Shao}, + date = {2023-02}, + journaltitle = {Signal, Image and Video Processing}, + title = {A hierarchical and data-efficient network based on patch-based representation}, + doi = {10.1007/s11760-023-02488-0}, + file = {:Yao2023 - A Hierarchical and Data Efficient Network Based on Patch Based Representation.pdf:PDF}, + publisher = {Springer Science and Business Media {LLC}}, +} + +@Article{Chen2023, + author = {Chen, Xiangning and Liang, Chen and Huang, Da and Real, Esteban and Wang, Kaiyuan and Liu, Yao and Pham, Hieu and Dong, Xuanyi and Luong, Thang and Hsieh, Cho-Jui and Lu, Yifeng and Le, Quoc V.}, + date = {2023-02-13}, + title = {Symbolic Discovery of Optimization Algorithms}, + doi = {10.48550/ARXIV.2302.06675}, + eprint = {2302.06675}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + url = 
{https://github.com/google/automl/tree/master/lion}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Chen2023 - Symbolic Discovery of Optimization Algorithms.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Raissi2023, + author = {Raissi, Maziar}, + date = {2023-01-26}, + title = {Open Problems in Applied Deep Learning}, + doi = {10.48550/ARXIV.2301.11316}, + eprint = {2301.11316}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Raissi2023 - Open Problems in Applied Deep Learning.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Human-Computer Interaction (cs.HC), Information Retrieval (cs.IR), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Misc{Patro2023, + author = {Patro, Badri N. 
and Agneeswaran, Vijay Srinivas}, + date = {2023-02-16}, + title = {Efficiency 360: Efficient Vision Transformers}, + doi = {10.48550/ARXIV.2302.08374}, + eprint = {2302.08374}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + archiveprefix = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Patro2023 - Efficiency 360_ Efficient Vision Transformers.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@Article{Nag2023, + author = {Nag, Shashank and Datta, Gourav and Kundu, Souvik and Chandrachoodan, Nitin and Beerel, Peter A.}, + date = {2023-02-17}, + title = {ViTA: A Vision Transformer Inference Accelerator for Edge Applications}, + doi = {10.48550/ARXIV.2302.09108}, + eprint = {2302.09108}, + eprintclass = {cs.AR}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Nag2023 - ViTA_ a Vision Transformer Inference Accelerator for Edge Applications.pdf:PDF}, + keywords = {Hardware Architecture (cs.AR), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Caron2021, + author = {Caron, Mathilde and Touvron, Hugo and Misra, Ishan and Jégou, Hervé and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand}, + booktitle = {Proceedings of the International Conference on Computer Vision (ICCV)}, + date = {2021-04-29}, + title = {Emerging Properties in Self-Supervised Vision Transformers}, + doi = {10.48550/ARXIV.2104.14294}, + eprint = {2104.14294}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + publisher = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + 
file = {:Caron2021 - Emerging Properties in Self Supervised Vision Transformers.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2021}, +} + +@InProceedings{Graham2021, + author = {Graham, Ben and El-Nouby, Alaaeldin and Touvron, Hugo and Stock, Pierre and Joulin, Armand and Jégou, Hervé and Douze, Matthijs}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {LeViT: a Vision Transformer in ConvNet's Clothing for Faster Inference}, + doi = {10.48550/arxiv.2104.01136}, + eprint = {2104.01136}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {12259-12269}, + publisher = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Graham2021 - LeViT_ a Vision Transformer in ConvNet's Clothing for Faster Inference.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + month = {October}, + year = {2021}, +} + +@InProceedings{Recht2019, + author = {Recht, Benjamin and Roelofs, Rebecca and Schmidt, Ludwig and Shankar, Vaishaal}, + booktitle = {International conference on machine learning}, + title = {Do ImageNet Classifiers Generalize to ImageNet?}, + doi = {10.48550/arxiv.1902.10811}, + eprint = {1902.10811}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + organization = {PMLR}, + pages = {5389--5400}, + file = {:Recht2019 - Do ImageNet Classifiers Generalize to ImageNet_.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + readstatus = {skimmed}, + year = {2019}, +} + +@Article{Manzari2023, + author = {Manzari, Omid Nejati and Ahmadabadi, Hamid and Kashiani, Hossein and Shokouhi, Shahriar B. 
and Ayatollahi, Ahmad}, + date = {2023-02-19}, + title = {MedViT: A Robust Vision Transformer for Generalized Medical Image Classification}, + doi = {10.48550/ARXIV.2302.09462}, + eprint = {2302.09462}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Manzari2023 - MedViT_ a Robust Vision Transformer for Generalized Medical Image Classification.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Luo2021, + author = {Luo, Shengjie and Li, Shanda and Cai, Tianle and He, Di and Peng, Dinglan and Zheng, Shuxin and Ke, Guolin and Wang, Liwei and Liu, Tie-Yan}, + booktitle = {Advances in Neural Information Processing Systems}, + date = {2021-06-23}, + title = {Stable, Fast and Accurate: Kernelized Attention with Relative Positional Encoding}, + doi = {10.48550/arxiv.2106.12566}, + editor = {A. Beygelzimer and Y. Dauphin and P. Liang and J. 
Wortman Vaughan}, + eprint = {2106.12566}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=X7XNPor93uG}, + file = {:Luo2021 - Stable, Fast and Accurate_ Kernelized Attention with Relative Positional Encoding.pdf:PDF}, + year = {2021}, +} + +@Article{Agrawal2023, + author = {Agrawal, Siddharth}, + date = {2023-02-02}, + title = {Scaling Up Computer Vision Neural Networks Using Fast Fourier Transform}, + doi = {10.48550/ARXIV.2302.12185}, + eprint = {2302.12185}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Agrawal2023 - Scaling up Computer Vision Neural Networks Using Fast Fourier Transform.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Chen2021, + author = {Chen, Beidi and Dao, Tri and Winsor, Eric and Song, Zhao and Rudra, Atri and Ré, Christopher}, + booktitle = {Advances in Neural Information Processing Systems}, + date = {2021-10-28}, + title = {Scatterbrain: Unifying Sparse and Low-rank Attention Approximation}, + doi = {10.48550/arxiv.2110.15343}, + editor = {A. Beygelzimer and Y. Dauphin and P. Liang and J. 
Wortman Vaughan}, + eprint = {2110.15343}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=SehIKudiIo1}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Chen2021 - Scatterbrain_ Unifying Sparse and Low Rank Attention Approximation.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Desislavov2023, + author = {Radosvet Desislavov and Fernando Mart{\'{\i}}nez-Plumed and Jos{\'{e}} Hern{\'{a}}ndez-Orallo}, + date = {2023-02}, + journaltitle = {Sustainable Computing: Informatics and Systems}, + title = {Trends in {AI} inference energy consumption: Beyond the performance-vs-parameter laws of deep learning}, + doi = {10.1016/j.suscom.2023.100857}, + pages = {100857}, + file = {:Desislavov2023 - Trends in AI Inference Energy Consumption_ beyond the Performance Vs Parameter Laws of Deep Learning.pdf:PDF}, + publisher = {Elsevier {BV}}, +} + +@InProceedings{Xiong2021, + author = {Xiong, Yunyang and Zeng, Zhanpeng and Chakraborty, Rudrasis and Tan, Mingxing and Fung, Glenn and Li, Yin and Singh, Vikas}, + booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence}, + title = {Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention}, + doi = {10.48550/arxiv.2102.03902}, + eprint = {2102.03902}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + file = {:Xiong2021 - Nyströmformer_ a Nyström Based Algorithm for Approximating Self Attention.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Koh2023, + author = {Koh, Jing Yu and Salakhutdinov, Ruslan and Fried, Daniel}, + date = {2023-01-31}, + title = {Grounding Language Models to Images for Multimodal Generation}, + doi = 
{10.48550/ARXIV.2301.13823}, + eprint = {2301.13823}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Koh2023 - Grounding Language Models to Images for Multimodal Generation.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Zhai2022, + author = {Zhai, Xiaohua and Kolesnikov, Alexander and Houlsby, Neil and Beyer, Lucas}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + title = {Scaling Vision Transformers}, + doi = {10.48550/arxiv.2106.04560}, + eprint = {2106.04560}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {12104--12113}, + file = {:Zhai2022 - Scaling Vision Transformers.pdf:PDF}, + year = {2022}, +} + +@Article{Yu2022, + author = {Yu, Jiahui and Wang, Zirui and Vasudevan, Vijay and Yeung, Legg and Seyedhosseini, Mojtaba and Wu, Yonghui}, + title = {CoCa: Contrastive Captioners are Image-Text Foundation Models}, + doi = {10.48550/arxiv.2205.01917}, + eprint = {2205.01917}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + issn = {2835-8856}, + url = {https://openreview.net/forum?id=Ee277P3AYC}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Yu2022 - CoCa_ Contrastive Captioners Are Image Text Foundation Models.pdf:PDF}, + journal = {Transactions on Machine Learning Research}, + year = {2022}, +} + +@InProceedings{Brown2020, + author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. 
and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Language Models are Few-Shot Learners}, + doi = {10.48550/arxiv.2005.14165}, + editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin}, + eprint = {2005.14165}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + pages = {1877--1901}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, + volume = {33}, + file = {:Brown2020 - Language Models Are Few Shot Learners.pdf:PDF}, + year = {2020}, +} + +@Article{Takashima2023, + author = {Takashima, Sora and Hayamizu, Ryo and Inoue, Nakamasa and Kataoka, Hirokatsu and Yokota, Rio}, + date = {2023-03-02}, + title = {Visual Atoms: Pre-training Vision Transformers with Sinusoidal Waves}, + doi = {10.48550/ARXIV.2303.01112}, + eprint = {2303.01112}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Takashima2023 - Visual Atoms_ Pre Training Vision Transformers with Sinusoidal Waves.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@InProceedings{Vaswani2021, + author = {Vaswani, Ashish and Ramachandran, Prajit and Srinivas, Aravind and Parmar, Niki and Hechtman, Blake and Shlens, Jonathon}, + booktitle = {2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones}, + doi = 
{10.1109/CVPR46437.2021.01270}, + eprint = {2103.12731}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {12889-12899}, + publisher = {IEEE Computer Society}, + address = {Los Alamitos, CA, USA}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Vaswani2021 - Scaling Local Self Attention for Parameter Efficient Visual Backbones.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + month = {6}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Yang2022a, + author = {Yang, Jianwei and Li, Chunyuan and Dai, Xiyang and Yuan, Lu and Gao, Jianfeng}, + booktitle = {Advances in Neural Information Processing Systems}, + date = {2022-03-22}, + title = {Focal Modulation Networks}, + doi = {10.48550/arxiv.2203.11926}, + editor = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, + eprint = {2203.11926}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=ePhEbo039l}, + file = {:Yang2022a - Focal Modulation Networks.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Yin2022, + author = {Yin, Hongxu and Vahdat, Arash and Alvarez, Jose M. 
and Mallya, Arun and Kautz, Jan and Molchanov, Pavlo}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {A-ViT: Adaptive Tokens for Efficient Vision Transformer}, + pages = {10809-10818}, + file = {:Yin2022 - A ViT_ Adaptive Tokens for Efficient Vision Transformer.pdf:PDF}, + month = {6}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Meng2022, + author = {Meng, Lingchen and Li, Hengduo and Chen, Bor-Chun and Lan, Shiyi and Wu, Zuxuan and Jiang, Yu-Gang and Lim, Ser-Nam}, + title = {AdaViT: Adaptive Vision Transformers for Efficient Image Recognition}, + doi = {10.48550/arxiv.2111.15668}, + eprint = {2111.15668}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {12299-12308}, + file = {:Meng2022 - AdaViT_ Adaptive Vision Transformers for Efficient Image Recognition.pdf:PDF}, + journal = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + year = {2022}, +} + +@InProceedings{Tay2020, + author = {Tay, Yi and Bahri, Dara and Yang, Liu and Metzler, Donald and Juan, Da-Cheng}, + booktitle = {International Conference on Machine Learning}, + title = {Sparse Sinkhorn Attention}, + doi = {10.48550/ARXIV.2002.11296}, + eprint = {2002.11296}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + organization = {PMLR}, + pages = {9438--9447}, + publisher = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Tay2020 - Sparse Sinkhorn Attention.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@Article{Mehrani2023, + author = {Mehrani, Paria and Tsotsos, John K. 
}, + date = {2023-03-02}, + title = {Self-attention in vision transformers performs perceptual grouping, not attention}, + doi = {10.3389/fcomp.2023.1178450}, + eprint = {2303.01542}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + issn = {2624-9898}, + volume = {5}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Mehrani2023 - Self Attention in Vision Transformers Performs Perceptual Grouping, Not Attention.pdf:PDF}, + journal = {Frontiers in Computer Science}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Gupta2023, + author = {Gupta, Animesh and Hasan, Irtiza and Prasad, Dilip K. and Gupta, Deepak K.}, + date = {2023-03-03}, + title = {Data-Efficient Training of CNNs and Transformers with Coresets: A Stability Perspective}, + doi = {10.48550/ARXIV.2303.02095}, + eprint = {2303.02095}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Gupta2023 - Data Efficient Training of CNNs and Transformers with Coresets_ a Stability Perspective.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Yu2023, + author = {Yu, Lu and Xiang, Wei}, + date = {2023-03-08}, + title = {X-Pruner: eXplainable Pruning for Vision Transformers}, + doi = {10.48550/ARXIV.2303.04935}, + eprint = {2303.04935}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Yu2023 - X Pruner_ EXplainable Pruning for Vision Transformers.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and 
information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Chang2023, + author = {Chang, Shuning and Wang, Pichao and Lin, Ming and Wang, Fan and Zhang, David Junhao and Jin, Rong and Shou, Mike Zheng}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {Making Vision Transformers Efficient From a Token Sparsification View}, + eprint = {2303.08685}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {6195-6205}, + archiveprefix = {arXiv}, + file = {:Chang2023 - Making Vision Transformers Efficient from a Token Sparsification View.pdf:PDF}, + month = {June}, + year = {2023}, +} + +@Article{Katharopoulos2020, + author = {Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, François}, + title = {Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention}, + doi = {10.48550/arxiv.2006.16236}, + eprint = {2006.16236}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + pages = {5156--5165}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Katharopoulos2020 - Transformers Are RNNs_ Fast Autoregressive Transformers with Linear Attention.pdf:PDF}, + journal = {International Conference on Machine Learning}, + organization = {PMLR}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@InProceedings{Wu2021, + author = {Wu, Haiping and Xiao, Bin and Codella, Noel and Liu, Mengchen and Dai, Xiyang and Yuan, Lu and Zhang, Lei}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision}, + date = {2021-03-29}, + title = {CvT: Introducing Convolutions to Vision Transformers}, + doi = {10.48550/arxiv.2103.15808}, + eprint = {2103.15808}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {22--31}, + file = {:Wu2021 - CvT_ Introducing Convolutions to Vision Transformers.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + 
+@Article{Nekoozadeh2023, + author = {Nekoozadeh, Anahita and Ahmadzadeh, Mohammad Reza and Mardani, Zahra and Mardani, Morteza}, + date = {2023-03-22}, + title = {Multiscale Attention via Wavelet Neural Operators for Vision Transformers}, + doi = {10.48550/ARXIV.2303.12398}, + eprint = {2303.12398}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Nekoozadeh2023 - Multiscale Attention Via Wavelet Neural Operators for Vision Transformers.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Yao2022, + author = {Yao, Ting and Pan, Yingwei and Li, Yehao and Ngo, Chong-Wah and Mei, Tao}, + booktitle = {Computer Vision -- ECCV 2022}, + title = {Wave-ViT: Unifying Wavelet and Transformers for Visual Representation Learning}, + doi = {10.48550/arxiv.2207.04978}, + editor = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal}, + eprint = {2207.04978}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {328--345}, + publisher = {Springer Nature Switzerland}, + address = {Cham}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Yao2022 - Wave ViT_ Unifying Wavelet and Transformers for Visual Representation Learning.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Li2023d, + author = {Li, Yanyu and Hu, Ju and Wen, Yang and Evangelidis, Georgios and Salahi, Kamyar and Wang, Yanzhi and Tulyakov, Sergey and Ren, Jian}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Rethinking Vision Transformers for MobileNet Size and Speed}, + eprint = {2212.08059}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {16889-16900}, + archiveprefix = {arXiv}, + comment = 
{EfficientFormerV2}, + file = {:Li2022a - Rethinking Vision Transformers for MobileNet Size and Speed.pdf:PDF}, + month = {October}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Roy2021, + author = {Roy, Aurko and Saffar, Mohammad and Vaswani, Ashish and Grangier, David}, + title = {Efficient Content-Based Sparse Attention with Routing Transformers}, + doi = {10.1162/tacl_a_00353}, + eprint = {2003.05997}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + pages = {53--68}, + url = {https://aclanthology.org/2021.tacl-1.4}, + volume = {9}, + address = {Cambridge, MA}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Roy2021 - Efficient Content Based Sparse Attention with Routing Transformers.pdf:PDF}, + journal = {Transactions of the Association for Computational Linguistics}, + publisher = {MIT Press}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Tay2021, + author = {Tay, Yi and Bahri, Dara and Metzler, Donald and Juan, Da-Cheng and Zhao, Zhe and Zheng, Che}, + booktitle = {International conference on machine learning}, + title = {Synthesizer: Rethinking Self-Attention in Transformer Models}, + doi = {10.48550/arxiv.2005.00743}, + eprint = {2005.00743}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + organization = {PMLR}, + pages = {10183--10192}, + file = {:Tay2021 - Synthesizer_ Rethinking Self Attention in Transformer Models.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Fedus2022, + author = {Fedus, William and Zoph, Barret and Shazeer, Noam}, + title = {Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity}, + doi = {10.48550/ARXIV.2101.03961}, + eprint = {2101.03961}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + number = {1}, + pages = 
{5232--5270}, + volume = {23}, + file = {:Fedus2022 - Switch Transformers_ Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.pdf:PDF}, + journal = {The Journal of Machine Learning Research}, + publisher = {JMLR.org}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Ryoo2021, + author = {Ryoo, Michael S. and Piergiovanni, AJ and Arnab, Anurag and Dehghani, Mostafa and Angelova, Anelia}, + date = {2021-06-21}, + title = {TokenLearner: What Can 8 Learned Tokens Do for Images and Videos?}, + doi = {10.48550/ARXIV.2106.11297}, + eprint = {2106.11297}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Ryoo2021 - TokenLearner_ What Can 8 Learned Tokens Do for Images and Videos_.pdf:PDF}, + journal = {NeurIPS 2021}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Choromanski2024, + author = {Choromanski, Krzysztof Marcin and Li, Shanda and Likhosherstov, Valerii and Dubey, Kumar Avinava and Luo, Shengjie and He, Di and Yang, Yiming and Sarlos, Tamas and Weingarten, Thomas and Weller, Adrian}, + booktitle = {AISTATS 2024}, + title = {Learning a Fourier Transform for Linear Relative Positional Encodings in Transformers}, + eprint = {2302.01925}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + archiveprefix = {arXiv}, + file = {:Choromanski2023 - Learning a Fourier Transform for Linear Relative Positional Encodings in Transformers.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@InProceedings{Bolya2023, + author = {Bolya, Daniel and Fu, Cheng-Yang and Dai, Xiaoliang and Zhang, Peizhao and Feichtenhofer, Christoph and Hoffman, Judy}, + booktitle = {International Conference on Learning 
Representations}, + title = {Token Merging: Your ViT But Faster}, + doi = {10.48550/arxiv.2210.09461}, + eprint = {2210.09461}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Bolya2023 - Token Merging_ Your ViT but Faster.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@InProceedings{Xu2021, + author = {Xu, Weijian and Xu, Yifan and Chang, Tyler and Tu, Zhuowen}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision}, + title = {Co-Scale Conv-Attentional Image Transformers}, + doi = {10.48550/arxiv.2104.06399}, + eprint = {2104.06399}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {9981--9990}, + file = {:Xu2021 - Co Scale Conv Attentional Image Transformers.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Ebert2023, + author = {Ebert, Nikolas and Stricker, Didier and Wasenm{\"u}ller, Oliver}, + title = {PLG-ViT: Vision Transformer with Parallel Local and Global Self-Attention}, + number = {7}, + pages = {3447}, + volume = {23}, + file = {:Ebert2023 - PLG ViT_ Vision Transformer with Parallel Local and Global Self Attention.pdf:PDF}, + journal = {Sensors}, + publisher = {MDPI}, + year = {2023}, +} + +@Article{Zhang2023, + author = {Zhang, Qiming and Zhang, Jing and Xu, Yufei and Tao, Dacheng}, + date = {2023-03-27}, + title = {Vision Transformer with Quadrangle Attention}, + doi = {10.48550/ARXIV.2303.15105}, + eprint = {2303.15105}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhang2023 - Vision Transformer with Quadrangle Attention.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Ronen2023, + author = {Ronen, Tomer and Levy, Omer and Golbert, Avram}, + date = {2023-04-01}, + title = {Vision Transformers with 
Mixed-Resolution Tokenization}, + doi = {10.48550/ARXIV.2304.00287}, + eprint = {2304.00287}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ronen2023 - Vision Transformers with Mixed Resolution Tokenization.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Dai2019, + author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan}, + booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, + title = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context}, + doi = {10.18653/v1/P19-1285}, + eprint = {1901.02860}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + pages = {2978--2988}, + publisher = {Association for Computational Linguistics}, + address = {Florence, Italy}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Dai2019 - Transformer XL_ Attentive Language Models beyond a Fixed Length Context.pdf:PDF}, + month = jul, + year = {2019}, +} + +@Article{Togelius2023, + author = {Togelius, Julian and Yannakakis, Georgios N.}, + date = {2023-03-31}, + title = {Choose Your Weapon: Survival Strategies for Depressed AI Academics}, + doi = {10.48550/ARXIV.2304.06035}, + eprint = {2304.06035}, + eprintclass = {cs.OH}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Togelius2023 - Choose Your Weapon_ Survival Strategies for Depressed AI Academics.pdf:PDF}, + keywords = {Other Computer Science (cs.OH), Computers and Society (cs.CY), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences}, + publisher = {arXiv}, + readstatus = {skimmed}, + year = {2023}, +} + +@InProceedings{Kim2021, + author = {Kyungmin Kim and Bichen Wu and Xiaoliang 
Dai and Peizhao Zhang and Zhicheng Yan and Peter Vajda and Seon Kim}, + booktitle = {2021 {IEEE}/{CVF} Conference on Computer Vision and Pattern Recognition Workshops ({CVPRW})}, + date = {2021-06}, + title = {Rethinking the Self-Attention in Vision Transformers}, + doi = {10.1109/cvprw53098.2021.00342}, + publisher = {{IEEE}}, + file = {:Kim2021 - Rethinking the Self Attention in Vision Transformers.pdf:PDF}, + year = {2021}, +} + +@Misc{Islam2022, + author = {Islam, Khawar}, + date = {2022-03-03}, + title = {Recent Advances in Vision Transformer: A Survey and Outlook of Recent Work}, + doi = {10.48550/ARXIV.2203.01536}, + eprint = {2203.01536}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + archiveprefix = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Islam2022 - Recent Advances in Vision Transformer_ a Survey and Outlook of Recent Work.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Han2023, + author = {K. Han and Y. Wang and H. Chen and X. Chen and J. Guo and Z. Liu and Y. Tang and A. Xiao and C. Xu and Y. Xu and Z. Yang and Y. Zhang and D. 
Tao}, + title = {A Survey on Vision Transformer}, + doi = {10.1109/TPAMI.2022.3152247}, + issn = {1939-3539}, + number = {01}, + pages = {87-110}, + volume = {45}, + address = {Los Alamitos, CA, USA}, + file = {:Han2023 - A Survey on Vision Transformer.pdf:PDF}, + journal = {IEEE Transactions on Pattern Analysis \& Machine Intelligence}, + keywords = {transformers;task analysis;encoding;computer vision;computational modeling;visualization;object detection}, + month = {1}, + publisher = {IEEE Computer Society}, + year = {2023}, +} + +@InProceedings{Tay2021a, + author = {Tay, Yi and Dehghani, Mostafa and Abnar, Samira and Shen, Yikang and Bahri, Dara and Pham, Philip and Rao, Jinfeng and Yang, Liu and Ruder, Sebastian and Metzler, Donald}, + booktitle = {International Conference on Learning Representations}, + title = {Long Range Arena: A Benchmark for Efficient Transformers}, + doi = {10.48550/arxiv.2011.04006}, + eprint = {2011.04006}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Tay2021a - Long Range Arena_ a Benchmark for Efficient Transformers.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Ranftl2021, + author = {Ranftl, Ren\'e and Bochkovskiy, Alexey and Koltun, Vladlen}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Vision Transformers for Dense Prediction}, + pages = {12179-12188}, + url = {https://openaccess.thecvf.com/content/ICCV2021/html/Ranftl_Vision_Transformers_for_Dense_Prediction_ICCV_2021_paper.html}, + file = {:Ranftl2021 - Vision Transformers for Dense Prediction.pdf:PDF}, + month = {October}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Park2022, + author = {Park, Namuk and Kim, Songkuk}, + booktitle = {International Conference on Learning Representations}, + title = {How Do Vision Transformers 
Work?}, + doi = {10.48550/arxiv.2202.06709}, + eprint = {2202.06709}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=D78Go4hVcxO}, + file = {:Park2022 - How Do Vision Transformers Work_.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Lin2022, + author = {Tianyang Lin and Yuxin Wang and Xiangyang Liu and Xipeng Qiu}, + date = {2022}, + journaltitle = {{AI} Open}, + title = {A survey of transformers}, + doi = {10.1016/j.aiopen.2022.10.001}, + issn = {2666-6510}, + pages = {111--132}, + volume = {3}, + file = {:Lin2022 - A Survey of Transformers.pdf:PDF}, + journal = {AI Open}, + publisher = {Elsevier {BV}}, + year = {2022}, +} + +@Article{Liu2023b, + author = {Liu, Yang and Zhang, Yao and Wang, Yixin and Hou, Feng and Yuan, Jin and Tian, Jiang and Zhang, Yang and Shi, Zhongchao and Fan, Jianping and He, Zhiqiang}, + title = {A Survey of Visual Transformers}, + doi = {10.1109/TNNLS.2022.3227717}, + eprint = {2111.06091}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {1-21}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Liu2023b - A Survey of Visual Transformers.pdf:PDF}, + journal = {IEEE Transactions on Neural Networks and Learning Systems}, + publisher = {IEEE}, + year = {2023}, +} + +@Article{Xu2021a, + author = {Yifan Xu and Huapeng Wei and Minxuan Lin and Yingying Deng and Kekai Sheng and Mengdan Zhang and Fan Tang and Weiming Dong and Feiyue Huang and Changsheng Xu}, + date = {2021-10}, + title = {Transformers in computational visual media: A survey}, + doi = {10.1007/s41095-021-0247-3}, + number = {1}, + pages = {33--62}, + volume = {8}, + file = {:Xu2021a - Transformers in Computational Visual Media_ a Survey.pdf:PDF}, + journal = {Computational Visual Media}, + publisher = {Springer Science and Business Media {LLC}}, + year = {2021}, +} + +@Article{Shamshad2023, + author = {Fahad Shamshad and Salman Khan and Syed Waqas Zamir 
and Muhammad Haris Khan and Munawar Hayat and Fahad Shahbaz Khan and Huazhu Fu}, + date = {2023-04}, + title = {Transformers in medical imaging: A survey}, + doi = {10.1016/j.media.2023.102802}, + pages = {102802}, + file = {:Shamshad2023 - Transformers in Medical Imaging_ a Survey.pdf:PDF}, + journal = {Medical Image Analysis}, + publisher = {Elsevier {BV}}, + year = {2023}, +} + +@Article{Aleissaee2023, + author = {Abdulaziz Amer Aleissaee and Amandeep Kumar and Rao Muhammad Anwer and Salman Khan and Hisham Cholakkal and Gui-Song Xia and Fahad Shahbaz Khan}, + date = {2023-03}, + title = {Transformers in Remote Sensing: A Survey}, + doi = {10.3390/rs15071860}, + number = {7}, + pages = {1860}, + volume = {15}, + file = {:Aleissaee2023 - Transformers in Remote Sensing_ a Survey.pdf:PDF}, + journal = {Remote Sensing}, + publisher = {{MDPI} {AG}}, + year = {2023}, +} + +@Article{Selva2023, + author = {Selva, Javier and Johansen, Anders S. and Escalera, Sergio and Nasrollahi, Kamal and Moeslund, Thomas B. and Clapés, Albert}, + title = {Video Transformers: A Survey}, + doi = {10.1109/TPAMI.2023.3243465}, + pages = {1-20}, + file = {:Selva2023 - Video Transformers_ a Survey.pdf:PDF}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year = {2023}, +} + +@Article{Li2023b, + author = {Jun Li and Junyu Chen and Yucheng Tang and Ce Wang and Bennett A. Landman and S. Kevin Zhou}, + date = {2023-04}, + title = {Transforming medical imaging with Transformers? 
A comparative review of key properties, current progresses, and future perspectives}, + doi = {10.1016/j.media.2023.102762}, + pages = {102762}, + volume = {85}, + file = {:Li2023b - Transforming Medical Imaging with Transformers_ a Comparative Review of Key Properties, Current Progresses, and Future Perspectives.pdf:PDF}, + journal = {Medical Image Analysis}, + publisher = {Elsevier {BV}}, + year = {2023}, +} + +@Article{He2023, + author = {Kelei He and Chen Gan and Zhuoyuan Li and Islem Rekik and Zihao Yin and Wen Ji and Yang Gao and Qian Wang and Junfeng Zhang and Dinggang Shen}, + date = {2023-02}, + title = {Transformers in medical image analysis}, + doi = {10.1016/j.imed.2022.07.002}, + number = {1}, + pages = {59--78}, + volume = {3}, + file = {:He2023 - Transformers in Medical Image Analysis.pdf:PDF}, + journal = {Intelligent Medicine}, + publisher = {Elsevier {BV}}, + year = {2023}, +} + +@Misc{Yang2022b, + author = {Yang, Yuting and Jiao, Licheng and Liu, Xu and Liu, Fang and Yang, Shuyuan and Feng, Zhixi and Tang, Xu}, + date = {2022-03-24}, + title = {Transformers Meet Visual Learning Understanding: A Comprehensive Review}, + doi = {10.48550/ARXIV.2203.12944}, + eprint = {2203.12944}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + archiveprefix = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Yang2022b - Transformers Meet Visual Learning Understanding_ a Comprehensive Review.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Misc{Latif2023, + author = {Latif, Siddique and Zaidi, Aun and Cuayahuitl, Heriberto and Shamshad, Fahad and Shoukat, Moazzam and Qadir, Junaid}, + date = {2023-03-21}, + title = {Transformers in Speech Processing: A Survey}, + doi = {10.48550/ARXIV.2303.11607}, + eprint = {2303.11607}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + archiveprefix = {arXiv}, + copyright = 
{arXiv.org perpetual, non-exclusive license}, + file = {:Latif2023 - Transformers in Speech Processing_ a Survey.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Casola2022, + author = {Silvia Casola and Ivano Lauriola and Alberto Lavelli}, + date = {2022-09}, + title = {Pre-trained transformers: an empirical comparison}, + doi = {10.1016/j.mlwa.2022.100334}, + pages = {100334}, + volume = {9}, + file = {:Casola2020 - Pre Trained Transformers_ an Empirical Comparison.pdf:PDF}, + journal = {Machine Learning with Applications}, + publisher = {Elsevier {BV}}, + year = {2022}, +} + +@Misc{Ulhaq2022, + author = {Ulhaq, Anwaar and Akhtar, Naveed and Pogrebna, Ganna and Mian, Ajmal}, + date = {2022-09-13}, + title = {Vision Transformers for Action Recognition: A Survey}, + doi = {10.48550/ARXIV.2209.05700}, + eprint = {2209.05700}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + archiveprefix = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Ulhaq2022 - Vision Transformers for Action Recognition_ a Survey.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Ali2023, + author = {Ali, Anas M. 
and Benjdira, Bilel and Koubaa, Anis and El-Shafai, Walid and Khan, Zahid and Boulila, Wadii}, + title = {Vision Transformers in Image Restoration: A Survey}, + doi = {10.3390/s23052385}, + issn = {1424-8220}, + number = {5}, + url = {https://www.mdpi.com/1424-8220/23/5/2385}, + volume = {23}, + article-number = {2385}, + file = {:Ali2023 - Vision Transformers in Image Restoration_ a Survey.pdf:PDF}, + journal = {Sensors}, + pubmedid = {36904589}, + year = {2023}, +} + +@Article{Parvaiz2023, + author = {Arshi Parvaiz and Muhammad Anwaar Khalid and Rukhsana Zafar and Huma Ameer and Muhammad Ali and Muhammad Moazam Fraz}, + date = {2023-06}, + title = {Vision Transformers in medical computer vision{\textemdash}A contemplative retrospection}, + doi = {10.1016/j.engappai.2023.106126}, + pages = {106126}, + volume = {122}, + file = {:Parvaiz2023 - Vision Transformers in Medical Computer Vision_A Contemplative Retrospection.pdf:PDF}, + journal = {Engineering Applications of Artificial Intelligence}, + publisher = {Elsevier {BV}}, + year = {2023}, +} + +@Article{Wen2022, + author = {Wen, Qingsong and Zhou, Tian and Zhang, Chaoli and Chen, Weiqi and Ma, Ziqing and Yan, Junchi and Sun, Liang}, + date = {2022-02-15}, + title = {Transformers in Time Series: A Survey}, + doi = {10.48550/ARXIV.2202.07125}, + eprint = {2202.07125}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Wen2022 - Transformers in Time Series_ a Survey.pdf:PDF}, + journal = {In the 32nd International Joint Conference on Artificial Intelligence (IJCAI 2023)}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Signal Processing (eess.SP), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + publisher = {arXiv}, + year = {2022}, +} + +@InProceedings{Li2023a, + author = {Li, Wei and Xie, Jiahao and Loy, Chen Change}, + 
booktitle = {The Eleventh International Conference on Learning Representations}, + title = {Correlational Image Modeling for Self-Supervised Visual Pre-Training}, + doi = {10.48550/arxiv.2303.12670}, + eprint = {2303.12670}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=09hVcSDkea}, + file = {:Li2023a - Correlational Image Modeling for Self Supervised Visual Pre Training.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Yeh2023, + author = {Yeh, Catherine and Chen, Yida and Wu, Aoyu and Chen, Cynthia and Viégas, Fernanda and Wattenberg, Martin}, + date = {2023-05-04}, + title = {AttentionViz: A Global View of Transformer Attention}, + doi = {10.48550/ARXIV.2305.03210}, + eprint = {2305.03210}, + eprintclass = {cs.HC}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Yeh2023 - AttentionViz_ a Global View of Transformer Attention.pdf:PDF}, + keywords = {Human-Computer Interaction (cs.HC), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@InProceedings{Keles2023, + author = {Keles, Feyza Duman and Wijewardena, Pruthuvi Mahesakya and Hegde, Chinmay}, + booktitle = {International Conference on Algorithmic Learning Theory}, + title = {On The Computational Complexity of Self-Attention}, + doi = {10.48550/arxiv.2209.04881}, + eprint = {2209.04881}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + organization = {PMLR}, + pages = {597--619}, + file = {:Keles2023 - On the Computational Complexity of Self Attention.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Alabdulmohsin2023, + author = 
{Alabdulmohsin, Ibrahim and Zhai, Xiaohua and Kolesnikov, Alexander and Beyer, Lucas}, + date = {2023-05-22}, + title = {Getting ViT in Shape: Scaling Laws for Compute-Optimal Model Design}, + doi = {10.48550/ARXIV.2305.13035}, + eprint = {2305.13035}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Alabdulmohsin2023 - Getting ViT in Shape_ Scaling Laws for Compute Optimal Model Design.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences, I.2.10; I.2.6}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Abbas2023, + author = {Abbas, Amro and Tirumala, Kushal and Simig, Dániel and Ganguli, Surya and Morcos, Ari S.}, + date = {2023-03-16}, + title = {SemDeDup: Data-efficient learning at web-scale through semantic deduplication}, + doi = {10.48550/ARXIV.2303.09540}, + eprint = {2303.09540}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Abbas2023 - SemDeDup_ Data Efficient Learning at Web Scale through Semantic Deduplication.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Brock2021, + author = {Brock, Andrew and De, Soham and Smith, Samuel L. 
and Simonyan, Karen}, + booktitle = {International Conference on Machine Learning}, + title = {High-Performance Large-Scale Image Recognition Without Normalization}, + doi = {10.48550/arxiv.2102.06171}, + eprint = {2102.06171}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + organization = {PMLR}, + pages = {1059--1071}, + file = {:Brock2021 - High Performance Large Scale Image Recognition without Normalization.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Scribano2023, + author = {Carmelo Scribano and Giorgia Franchini and Marco Prato and Marko Bertogna}, + date = {2023-02}, + journaltitle = {Journal of Scientific Computing}, + title = {{DCT}-Former: Efficient Self-Attention with Discrete Cosine Transform}, + doi = {10.1007/s10915-023-02125-5}, + number = {3}, + volume = {94}, + file = {:Scribano2023 - DCT Former_ Efficient Self Attention with Discrete Cosine Transform.pdf:PDF}, + publisher = {Springer Science and Business Media {LLC}}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, +} + +@Article{Zhang2022a, + author = {Zhang, Lei and Zhang, Jie and Lei, Bowen and Mukherjee, Subhabrata and Pan, Xiang and Zhao, Bo and Ding, Caiwen and Li, Yao and Xu, Dongkuan}, + date = {2022-12-12}, + title = {Accelerating Dataset Distillation via Model Augmentation}, + doi = {10.48550/ARXIV.2212.06152}, + eprint = {2212.06152}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhang2022a - Accelerating Dataset Distillation Via Model Augmentation.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@InProceedings{Davari2022, + author = {MohammadReza Davari and Stefan 
Horoi and Amine Natik and Guillaume Lajoie and Guy Wolf and Eugene Belilovsky}, + booktitle = {NeurIPS ML Safety Workshop}, + title = {Deceiving the {CKA} Similarity Measure in Deep Learning}, + url = {https://openreview.net/forum?id=hITONWhDIIJ}, + file = {:Davari2022 - Deceiving the CKA Similarity Measure in Deep Learning.pdf:PDF}, + year = {2022}, +} + +@InProceedings{Kolesnikov2020, + author = {Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Puigcerver, Joan and Yung, Jessica and Gelly, Sylvain and Houlsby, Neil}, + booktitle = {Computer Vision -- ECCV 2020}, + title = {Big Transfer (BiT): General Visual Representation Learning}, + doi = {10.48550/arxiv.1912.11370}, + eprint = {1912.11370}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + isbn = {978-3-030-58558-7}, + pages = {491--507}, + publisher = {Springer International Publishing}, + address = {Cham}, + file = {:Kolesnikov2020 - Big Transfer (BiT)_ General Visual Representation Learning.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2020}, +} + +@InProceedings{He2016, + author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, + title = {Deep Residual Learning for Image Recognition}, + doi = {10.48550/arxiv.1512.03385}, + eprint = {1512.03385}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {770--778}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:He2016 - Deep Residual Learning for Image Recognition.pdf:PDF}, + year = {2016}, +} + +@InProceedings{Martins2022, + author = {Martins, Pedro Henrique and Marinho, Zita and Martins, André F. 
T.}, + booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + title = {$\infty$-former: Infinite Memory Transformer}, + doi = {10.18653/v1/2022.acl-long.375}, + eprint = {2109.00301}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + pages = {5468--5485}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2022.acl-long.375}, + address = {Dublin, Ireland}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Martins2022 - $$ Former_ Infinite Memory Transformer.pdf:PDF}, + month = may, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Misc{Mukherjee2023, + author = {Subhabrata Mukherjee and Arindam Mitra and Ganesh Jawahar and Sahaj Agarwal and Hamid Palangi and Ahmed Awadallah}, + title = {Orca: Progressive Learning from Complex Explanation Traces of GPT-4}, + doi = {10.48550/arxiv.2306.02707}, + eprint = {2306.02707}, + archiveprefix = {arXiv}, + file = {:Mukherjee2023 - Orca_ Progressive Learning from Complex Explanation Traces of GPT 4.pdf:PDF}, + primaryclass = {cs.CL}, + year = {2023}, +} + +@InProceedings{You2020, + author = {You, Yang and Li, Jing and Reddi, Sashank and Hseu, Jonathan and Kumar, Sanjiv and Bhojanapalli, Srinadh and Song, Xiaodan and Demmel, James and Keutzer, Kurt and Hsieh, Cho-Jui}, + booktitle = {International Conference on Learning Representations}, + title = {Large Batch Optimization for Deep Learning: Training BERT in 76 minutes}, + doi = {10.48550/arxiv.1904.00962}, + eprint = {1904.00962}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=Syx4wnEtvH}, + file = {:You2020 - Large Batch Optimization for Deep Learning_ Training BERT in 76 Minutes.pdf:PDF}, + year = {2020}, +} + +@Article{Diwan2023, + author = {Diwan, Anuj and Choi, Eunsol and Harwath, David}, + date = {2023-06-14}, + title = {When to Use Efficient Self Attention? 
Profiling Text, Speech and Image Transformer Variants}, + doi = {10.48550/ARXIV.2306.08667}, + eprint = {2306.08667}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Diwan2023 - When to Use Efficient Self Attention_ Profiling Text, Speech and Image Transformer Variants.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Kurtz2023, + author = {Kurtz, Yoav and Bar, Noga and Giryes, Raja}, + date = {2023-06-16}, + title = {Group Orthogonalization Regularization For Vision Models Adaptation and Robustness}, + doi = {10.48550/ARXIV.2306.10001}, + eprint = {2306.10001}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Kurtz2023 - Group Orthogonalization Regularization for Vision Models Adaptation and Robustness.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Zhang2023a, + author = {Zhang, Jifan and Chen, Yifang and Canal, Gregory and Mussmann, Stephen and Zhu, Yinglun and Du, Simon Shaolei and Jamieson, Kevin and Nowak, Robert D}, + date = {2023-06-16}, + title = {LabelBench: A Comprehensive Framework for Benchmarking Label-Efficient Learning}, + doi = {10.48550/ARXIV.2306.09910}, + eprint = {2306.09910}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhang2023a - LabelBench_ a Comprehensive Framework for Benchmarking Label Efficient Learning.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and 
information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Ding2023, + author = {Ding, Jiayu and Ma, Shuming and Dong, Li and Zhang, Xingxing and Huang, Shaohan and Wang, Wenhui and Wei, Furu}, + date = {2023-07-05}, + title = {LongNet: Scaling Transformers to 1,000,000,000 Tokens}, + doi = {10.48550/ARXIV.2307.02486}, + eprint = {2307.02486}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ding2023 - LongNet_ Scaling Transformers to 1,000,000,000 Tokens.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@InProceedings{Tworkowski2023, + author = {Tworkowski, Szymon and Staniszewski, Konrad and Pacek, Mikołaj and Wu, Yuhuai and Michalewski, Henryk and Miłoś, Piotr}, + booktitle = {Thirty-seventh Conference on Neural Information Processing Systems}, + title = {Focused Transformer: Contrastive Training for Context Scaling}, + doi = {10.48550/arxiv.2307.03170}, + eprint = {2307.03170}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + publisher = {arXiv}, + file = {:Tworkowski2023 - Focused Transformer_ Contrastive Training for Context Scaling.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Vincent2015, + author = {Vincent, Pascal and de Brébisson, Alexandre and Bouthillier, Xavier}, + title = {Efficient Exact Gradient Update for training Deep Networks with Very Large Sparse Targets}, + doi = {10.48550/arxiv.1412.7091}, + eprint = {1412.7091}, + eprintclass = {cs.NE}, + eprinttype = {arXiv}, + volume = {28}, + file = {:Vincent2015 - Efficient Exact Gradient Update for Training Deep Networks with Very Large Sparse Targets.pdf:PDF}, + journal = {Advances in Neural Information Processing Systems}, + keywords = {Neural and 
Evolutionary Computing (cs.NE), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + year = {2015}, +} + +@InProceedings{Liang2017, + author = {Liang, Xuezhi and Wang, Xiaobo and Lei, Zhen and Liao, Shengcai and Li, Stan Z.}, + booktitle = {Neural Information Processing}, + title = {Soft-Margin Softmax for Deep Classification}, + editor = {Liu, Derong and Xie, Shengli and Li, Yuanqing and Zhao, Dongbin and El-Alfy, El-Sayed M.}, + isbn = {978-3-319-70096-0}, + pages = {413--421}, + publisher = {Springer International Publishing}, + address = {Cham}, + file = {:Liang2017 - Soft Margin Softmax for Deep Classification.pdf:PDF}, + year = {2017}, +} + +@Article{Banerjee2020, + author = {Kunal Banerjee and Vishak Prasad C. and Rishi Raj Gupta and Karthik Vyas and Anushree H. and Biswajit Mishra}, + title = {Exploring Alternatives to Softmax Function}, + doi = {10.48550/arxiv.2011.11538}, + eprint = {2011.11538}, + eprinttype = {arXiv}, + volume = {abs/2011.11538}, + copyright = {Creative Commons Attribution Share Alike 4.0 International}, + file = {:Banerjee2020 - Exploring Alternatives to Softmax Function.pdf:PDF}, + journal = {CoRR}, + year = {2020}, +} + +@InProceedings{Brebisson2016, + author = {Alexandre de Br{\'{e}}bisson and Pascal Vincent}, + booktitle = {4th International Conference on Learning Representations, {ICLR} 2016, San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings}, + title = {An Exploration of Softmax Alternatives Belonging to the Spherical Loss Family}, + doi = {10.48550/arxiv.1511.05042}, + editor = {Yoshua Bengio and Yann LeCun}, + eprint = {1511.05042}, + eprintclass = {cs.NE}, + eprinttype = {arXiv}, + file = {:Brebisson2016 - An Exploration of Softmax Alternatives Belonging to the Spherical Loss Family.pdf:PDF}, + keywords = {Neural and Evolutionary Computing (cs.NE), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + year = {2016}, +} 
+ +@InProceedings{Solodskikh2023, + author = {Solodskikh, Kirill and Kurbanov, Azim and Aydarkhanov, Ruslan and Zhelavskaya, Irina and Parfenov, Yury and Song, Dehua and Lefkimmiatis, Stamatios}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {Integral Neural Networks}, + pages = {16113-16122}, + url = {https://openaccess.thecvf.com/content/CVPR2023/html/Solodskikh_Integral_Neural_Networks_CVPR_2023_paper.html}, + file = {:Solodskikh2023 - Integral Neural Networks.pdf:PDF}, + month = {June}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{He2023a, + author = {He, Muyang and Yang, Shuo and Huang, Tiejun and Zhao, Bo}, + date = {2023-06-08}, + title = {Large-scale Dataset Pruning with Dynamic Uncertainty}, + doi = {10.48550/ARXIV.2306.05175}, + eprint = {2306.05175}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:He2023a - Large Scale Dataset Pruning with Dynamic Uncertainty.pdf:PDF}, + groups = {Coreset for FL}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@InCollection{Paszke2019, + author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith}, + booktitle = {Advances in Neural Information Processing Systems 32}, + title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library}, + pages = {8024--8035}, + 
publisher = {Curran Associates, Inc.}, + year = {2019}, +} + +@Misc{Wightman2019, + author = {Ross Wightman}, + title = {PyTorch Image Models}, + doi = {10.5281/zenodo.4414861}, + howpublished = {\url{https://github.com/rwightman/pytorch-image-models}}, + journal = {GitHub repository}, + publisher = {GitHub}, + year = {2019}, +} + +@Article{Bertsch2023, + author = {Bertsch, Amanda and Alon, Uri and Neubig, Graham and Gormley, Matthew R.}, + date = {2023-05-02}, + title = {Unlimiformer: Long-Range Transformers with Unlimited Length Input}, + doi = {10.48550/ARXIV.2305.01625}, + eprint = {2305.01625}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Bertsch2023 - Unlimiformer_ Long Range Transformers with Unlimited Length Input.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Sun2023, + author = {Sun, Yutao and Dong, Li and Huang, Shaohan and Ma, Shuming and Xia, Yuqing and Xue, Jilong and Wang, Jianyong and Wei, Furu}, + date = {2023-07-17}, + title = {Retentive Network: A Successor to Transformer for Large Language Models}, + doi = {10.48550/ARXIV.2307.08621}, + eprint = {2307.08621}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Sun2023 - Retentive Network_ a Successor to Transformer for Large Language Models.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@InProceedings{Chopard2021, + author = {Daphn{\'{e}} Chopard and Matthias S. 
Treder and Irena Spasi{\'{c}}}, + booktitle = {Proceedings of the Second Workshop on Insights from Negative Results in {NLP}}, + date = {2021}, + title = {Learning Data Augmentation Schedules for Natural Language Processing}, + doi = {10.18653/v1/2021.insights-1.14}, + publisher = {Association for Computational Linguistics}, + file = {:Chopard2021 - Learning Data Augmentation Schedules for Natural Language Processing.pdf:PDF}, +} + +@Article{Wei2021, + author = {Wei, Jason and Huang, Chengyu and Vosoughi, Soroush and Cheng, Yu and Xu, Shiqi}, + date = {2021-03-12}, + title = {Few-Shot Text Classification with Triplet Networks, Data Augmentation, and Curriculum Learning}, + doi = {10.18653/v1/2021.naacl-main.434}, + eprint = {2103.07552}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Wei2021 - Few Shot Text Classification with Triplet Networks, Data Augmentation, and Curriculum Learning.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {Association for Computational Linguistics}, + year = {2021}, +} + +@InProceedings{Ye2021, + author = {Seonghyeon Ye and Jiseon Kim and Alice Oh}, + booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, + date = {2021}, + title = {Efficient Contrastive Learning via Novel Data Augmentation and Curriculum Learning}, + doi = {10.18653/v1/2021.emnlp-main.138}, + publisher = {Association for Computational Linguistics}, + file = {:Ye2021 - Efficient Contrastive Learning Via Novel Data Augmentation and Curriculum Learning.pdf:PDF}, +} + +@Article{Tan2021, + author = {Tan, Mingxing and Le, Quoc V.}, + date = {2021-04-01}, + journaltitle = {International Conference on Machine Learning, 2021}, + title 
= {EfficientNetV2: Smaller Models and Faster Training}, + doi = {10.48550/ARXIV.2104.00298}, + eprint = {2104.00298}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Tan2021 - EfficientNetV2_ Smaller Models and Faster Training.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2021}, +} + +@Article{Ahn2023, + author = {Ahn, Sumyeong and Ko, Jongwoo and Yun, Se-Young}, + date = {2023-02-10}, + title = {CUDA: Curriculum of Data Augmentation for Long-Tailed Recognition}, + doi = {10.48550/ARXIV.2302.05499}, + eprint = {2302.05499}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ahn2023 - CUDA_ Curriculum of Data Augmentation for Long Tailed Recognition.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Chen2023a, + author = {Chen, Yongjie and Liu, Hongmin and Yin, Haoran and Fan, Bin}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Building Vision Transformers with Hierarchy Aware Feature Aggregation}, + pages = {5908-5918}, + file = {:Chen2023a - Building Vision Transformers with Hierarchy Aware Feature Aggregation.pdf:PDF}, + month = {October}, + year = {2023}, +} + +@InProceedings{Psomas2023, + author = {Psomas, Bill and Kakogeorgiou, Ioannis and Karantzalos, Konstantinos and Avrithis, Yannis}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Keep It SimPool: Who Said Supervised Transformers Suffer from Attention Deficit?}, + pages = {5350-5360}, + file = {:Psomas2023 - Keep It SimPool_ Who Said Supervised Transformers Suffer from Attention Deficit_.pdf:PDF}, + 
month = {October}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Darcet2023, + author = {Darcet, Timothée and Oquab, Maxime and Mairal, Julien and Bojanowski, Piotr}, + date = {2023-09-28}, + title = {Vision Transformers Need Registers}, + doi = {10.48550/ARXIV.2309.16588}, + eprint = {2309.16588}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {Transformers have recently emerged as a powerful tool for learning visual representations. In this paper, we identify and characterize artifacts in feature maps of both supervised and self-supervised ViT networks. The artifacts correspond to high-norm tokens appearing during inference primarily in low-informative background areas of images, that are repurposed for internal computations. We propose a simple yet effective solution based on providing additional tokens to the input sequence of the Vision Transformer to fill that role. We show that this solution fixes that problem entirely for both supervised and self-supervised models, sets a new state of the art for self-supervised visual models on dense visual prediction tasks, enables object discovery methods with larger models, and most importantly leads to smoother feature maps and attention maps for downstream visual processing.}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Darcet2023 - Vision Transformers Need Registers.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Chen2023b, + author = {Chen, Xiaohui and Wang, Yinkai and Du, Yuanqi and Hassoun, Soha and Liu, Li-Ping}, + date = {2023-09-22}, + title = {On Separate Normalization in Self-supervised Transformers}, + doi = {10.48550/ARXIV.2309.12931}, + eprint = {2309.12931}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons 
Attribution 4.0 International}, + file = {:Chen2023b - On Separate Normalization in Self Supervised Transformers.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Fan2023, + author = {Fan, Qihang and Huang, Huaibo and Chen, Mingrui and Liu, Hongmin and He, Ran}, + date = {2023-09-20}, + title = {RMT: Retentive Networks Meet Vision Transformers}, + doi = {10.48550/ARXIV.2309.11523}, + eprint = {2309.11523}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Fan2023 - RMT_ Retentive Networks Meet Vision Transformers.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Ma2023, + author = {Ma, Wenxuan and Li, Shuang and Zhang, JinMing and Liu, Chi Harold and Kang, Jingxuan and Wang, Yulin and Huang, Gao}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Borrowing Knowledge From Pre-trained Language Model: A New Data-efficient Visual Learning Paradigm}, + pages = {18786-18797}, + file = {:Ma2023 - Borrowing Knowledge from Pre Trained Language Model_ a New Data Efficient Visual Learning Paradigm.pdf:PDF}, + month = {October}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Faiz2023, + author = {Faiz, Ahmad and Kaneda, Sotaro and Wang, Ruhan and Osi, Rita and Sharma, Parteek and Chen, Fan and Jiang, Lei}, + date = {2023-09-25}, + title = {LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language Models}, + doi = {10.48550/ARXIV.2309.14393}, + eprint = {2309.14393}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons 
Attribution 4.0 International}, + file = {:Faiz2023 - LLMCarbon_ Modeling the End to End Carbon Footprint of Large Language Models.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Computers and Society (cs.CY), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Zhou2023, + author = {Zhou, Chong and Loy, Chen Change and Dai, Bo}, + date = {2023-09-19}, + title = {Interpret Vision Transformers as ConvNets with Dynamic Convolutions}, + doi = {10.48550/ARXIV.2309.10713}, + eprint = {2309.10713}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhou2023 - Interpret Vision Transformers As ConvNets with Dynamic Convolutions.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Xu2023, + author = {Xu, Yixing and Li, Chao and Li, Dong and Sheng, Xiao and Jiang, Fan and Tian, Lu and Sirasao, Ashish}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {FDViT: Improve the Hierarchical Architecture of Vision Transformer}, + pages = {5950-5960}, + file = {:Xu2023 - FDViT_ Improve the Hierarchical Architecture of Vision Transformer.pdf:PDF}, + month = {October}, + year = {2023}, +} + +@InProceedings{Zhao2023, + author = {Zhao, Bingyin and Yu, Zhiding and Lan, Shiyi and Cheng, Yutao and Anandkumar, Anima and Lao, Yingjie and Alvarez, Jose M.}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Fully Attentional Networks with Self-emerging Token Labeling}, + pages = {5585-5595}, + file = {:Zhao2023 - Fully Attentional Networks with Self Emerging Token Labeling.pdf:PDF}, + month = {October}, + qualityassured = {qualityAssured}, + readstatus = {read}, + 
year = {2023}, +} + +@InProceedings{Devlin2019, + author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, + booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, + date = {2018-10-11}, + title = {BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, + doi = {10.18653/v1/N19-1423}, + editor = {Burstein, Jill and Doran, Christy and Solorio, Thamar}, + eprint = {1810.04805}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + pages = {4171--4186}, + publisher = {Association for Computational Linguistics}, + address = {Minneapolis, Minnesota}, + file = {:Devlin2019 - BERT_ Pre Training of Deep Bidirectional Transformers for Language Understanding.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + month = jun, + year = {2019}, +} + +@Article{Dai2022, + author = {Dai, Jifeng and Shi, Min and Wang, Weiyun and Wu, Sitong and Xing, Linjie and Wang, Wenhai and Zhu, Xizhou and Lu, Lewei and Zhou, Jie and Wang, Xiaogang and Qiao, Yu and Hu, Xiaowei}, + date = {2022-11-10}, + title = {Demystify Transformers & Convolutions in Modern Image Deep Networks}, + doi = {10.48550/ARXIV.2211.05781}, + eprint = {2211.05781}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Dai2022 - Demystify Transformers & Convolutions in Modern Image Deep Networks.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Han2021, + author = {Han, Qi and Fan, Zejia and Dai, Qi and Sun, Lei and Cheng, Ming-Ming and Liu, Jiaying and Wang, Jingdong}, + title = {Demystifying local vision transformer: Sparse connectivity, weight sharing, and dynamic weight}, + number = {3}, + volume 
= {2}, + file = {:Han2021 - Demystifying Local Vision Transformer_ Sparse Connectivity, Weight Sharing, and Dynamic Weight.pdf:PDF}, + journal = {arXiv preprint arXiv:2106.04263}, + year = {2021}, +} + +@Article{Bozic2023, + author = {Bozic, Vukasin and Dordevic, Danilo and Coppola, Daniele and Thommes, Joseph}, + title = {Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as an Alternative to Attention Layers in Transformers}, + doi = {10.48550/ARXIV.2311.10642}, + eprint = {2311.10642}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Bozic2023 - Rethinking Attention_ Exploring Shallow Feed Forward Neural Networks As an Alternative to Attention Layers in Transformers.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{He2023b, + author = {He, Bobby and Hofmann, Thomas}, + date = {2023-11-03}, + title = {Simplifying Transformer Blocks}, + doi = {10.48550/ARXIV.2311.01906}, + eprint = {2311.01906}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:He2023b - Simplifying Transformer Blocks.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Zaheer2020, + author = {Zaheer, Manzil and Guruganesh, Guru and Dubey, Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and Ahmed, Amr}, + booktitle = {Proceedings of the 34th International Conference on Neural Information Processing Systems}, + title = {Big Bird: Transformers for Longer Sequences}, + isbn = {9781713829546}, + location = {Vancouver, 
BC, Canada}, + publisher = {Curran Associates Inc.}, + series = {NIPS'20}, + address = {Red Hook, NY, USA}, + articleno = {1450}, + file = {:Zaheer2020 - Big Bird_ Transformers for Longer Sequences.pdf:PDF}, + numpages = {15}, + year = {2020}, +} + +@Article{Child2019, + author = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya}, + date = {2019-04-23}, + title = {Generating Long Sequences with Sparse Transformers}, + doi = {10.48550/ARXIV.1904.10509}, + eprint = {1904.10509}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Child2019 - Generating Long Sequences with Sparse Transformers.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2019}, +} + +@InProceedings{Qiu2023, + author = {Qiu, Yuwei and Zhang, Kaihao and Wang, Chenxi and Luo, Wenhan and Li, Hongdong and Jin, Zhi}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {MB-TaylorFormer: Multi-Branch Efficient Transformer Expanded by Taylor Formula for Image Dehazing}, + pages = {12802-12813}, + file = {:Qiu2023 - MB TaylorFormer_ Multi Branch Efficient Transformer Expanded by Taylor Formula for Image Dehazing.pdf:PDF}, + month = {October}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@InProceedings{Iwana2019, + author = {Brian Kenji Iwana and Ryohei Kuroki and Seiichi Uchida}, + booktitle = {Proceedings - 2019 International Conference on Computer Vision Workshop, ICCVW 2019}, + title = {Explaining Convolutional Neural Networks using Softmax Gradient Layer-wise Relevance Propagation}, + doi = {10.1109/ICCVW.2019.00513}, + eprint = {1908.04351}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + language = {English}, + pages = {4176--4185}, + publisher = {Institute of Electrical and Electronics Engineers Inc.}, + series = 
{Proceedings - 2019 International Conference on Computer Vision Workshop, ICCVW 2019}, + address = {United States}, + file = {:Iwana2019 - Explaining Convolutional Neural Networks Using Softmax Gradient Layer Wise Relevance Propagation.pdf:PDF}, + keywords = {cs.CV, cs.LG, cs.NE}, + month = oct, + year = {2019}, +} + +@Article{Montavon2015, + author = {Montavon, Grégoire and Bach, Sebastian and Binder, Alexander and Samek, Wojciech and Müller, Klaus-Robert}, + date = {2015-12-08}, + journaltitle = {Pattern Recognition}, + title = {Explaining NonLinear Classification Decisions with Deep Taylor Decomposition}, + doi = {10.1016/j.patcog.2016.11.008}, + eprint = {1512.02479}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + issn = {0031-3203}, + pages = {211--222}, + volume = {65}, + file = {:Montavon2015 - Explaining NonLinear Classification Decisions with Deep Taylor Decomposition.pdf:PDF}, + journal = {Pattern Recognition}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + month = may, + publisher = {Elsevier BV}, + year = {2015}, +} + + +@InProceedings{Wang2019, + author = {Wang, Xiaodi and Li, Ce and Mou, Yipeng and Zhang, Baochang and Han, Jungong and Liu, Jianzhuang}, + booktitle = {2019 IEEE Winter Conference on Applications of Computer Vision (WACV)}, + date = {2019-01}, + title = {Taylor Convolutional Networks for Image Classification}, + doi = {10.1109/wacv.2019.00140}, + publisher = {IEEE}, + file = {:Wang2019 - Taylor Convolutional Networks for Image Classification.pdf:PDF}, +} + +@Misc{Zhao2023a, + author = {Hongjue Zhao and Yizhuo Chen and Dachun Sun and Yingdong Hu and Kaizhao Liang and Yanbing Mao and Lui Sha and Huajie Shao}, + title = {TaylorNet: A Taylor-Driven Generic Neural Architecture}, + file = {:Zhao2023a - TaylorNet_ a Taylor Driven Generic Neural Architecture.pdf:PDF}, + year = {2023}, +} + + +@Article{Xing2020, + author = {Xing, Changda and Wang, Meiling and Dong, Chong and 
Duan, Chaowei and Wang, Zhisheng}, + date = {2020-08}, + journaltitle = {Neurocomputing}, + title = {Using Taylor Expansion and Convolutional Sparse Representation for Image Fusion}, + doi = {10.1016/j.neucom.2020.04.002}, + issn = {0925-2312}, + pages = {437--455}, + volume = {402}, + file = {:Xing2020 - Using Taylor Expansion and Convolutional Sparse Representation for Image Fusion.pdf:PDF}, + journal = {Neurocomputing}, + publisher = {Elsevier BV}, + year = {2020}, +} + + +@InProceedings{Gaikwad2018, + author = {Gaikwad, Akash Sunil and El-Sharkawy, Mohamed}, + booktitle = {2018 IEEE International Symposium on Signal Processing and Information Technology (ISSPIT)}, + date = {2018-12}, + title = {Pruning convolution neural network (squeezenet) using taylor expansion-based criterion}, + doi = {10.1109/isspit.2018.8705095}, + publisher = {IEEE}, + file = {:Gaikwad2018 - Pruning Convolution Neural Network (squeezenet) Using Taylor Expansion Based Criterion.pdf:PDF}, + year = {2018}, +} + +@InProceedings{Molchanov2017, + author = {Pavlo Molchanov and Stephen Tyree and Tero Karras and Timo Aila and Jan Kautz}, + booktitle = {International Conference on Learning Representations}, + title = {Pruning Convolutional Neural Networks for Resource Efficient Inference}, + doi = {10.48550/arxiv.1611.06440}, + eprint = {1611.06440}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Molchanov2017 - Pruning Convolutional Neural Networks for Resource Efficient Inference.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2017}, +} + +@InProceedings{Schaeffer2023, + author = {Rylan Schaeffer and Brando Miranda and Sanmi Koyejo}, + booktitle = {Thirty-seventh Conference on Neural Information Processing Systems}, + title = {Are Emergent Abilities of Large Language Models a Mirage?}, + doi = {10.48550/arxiv.2304.15004}, + eprint = 
{2304.15004}, + eprintclass = {cs.AI}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=ITw9edRDlD}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Schaeffer2023 - Are Emergent Abilities of Large Language Models a Mirage_.pdf:PDF}, + keywords = {Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + year = {2023}, +} + +@InProceedings{Nivron2023, + author = {Omer Nivron and Raghul Parthipan and Damon Wischik}, + booktitle = {ICML Workshop on New Frontiers in Learning, Control, and Dynamical Systems}, + title = {Taylorformer: Probabalistic Modelling for Random Processes including Time Series}, + doi = {10.48550/arxiv.2305.19141}, + eprint = {2305.19141}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Nivron2023 - Taylorformer_ Probabalistic Modelling for Random Processes Including Time Series.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + year = {2023}, +} + +@InProceedings{Nauen2025, + author = {Tobias Christian Nauen and Sebastian Palacio and Andreas Dengel}, + booktitle = {Proceedings of the Winter Conference on Applications of Computer Vision (WACV)}, + title = {Which Transformer to Favor: A Comparative Analysis of Efficiency in Vision Transformers}, + eprint = {2308.09372}, + pages = {6955-6966}, + archiveprefix = {arXiv}, + month = {February}, + primaryclass = {cs.CV}, + year = {2025}, +} + +@Article{Bulatov2023, + author = {Bulatov, Aydar and Kuratov, Yuri and Burtsev, Mikhail S.}, + date = {2023-04-19}, + title = {Scaling Transformer to 1M tokens and beyond with RMT}, + doi = {10.48550/ARXIV.2304.11062}, + eprint = {2304.11062}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + file = {:Bulatov2023 - Scaling Transformer to 1M Tokens and beyond with RMT.pdf:PDF}, + year = {2023}, +} + 
+@InProceedings{Gu2022, + author = {Albert Gu and Karan Goel and Christopher Re}, + booktitle = {International Conference on Learning Representations}, + title = {Efficiently Modeling Long Sequences with Structured State Spaces}, + doi = {10.48550/arxiv.2111.00396}, + eprint = {2111.00396}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Gu2022 - Efficiently Modeling Long Sequences with Structured State Spaces.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Maas2011, + author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher}, + booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, + title = {Learning Word Vectors for Sentiment Analysis}, + editor = {Lin, Dekang and Matsumoto, Yuji and Mihalcea, Rada}, + pages = {142--150}, + publisher = {Association for Computational Linguistics}, + address = {Portland, Oregon, USA}, + file = {:Maas2011 - Learning Word Vectors for Sentiment Analysis.pdf:PDF}, + month = {June}, + year = {2011}, +} + +@InProceedings{Dass2023, + author = {J. Dass and S. Wu and H. Shi and C. Li and Z. Ye and Z. Wang and Y. 
Lin}, + booktitle = {2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)}, + title = {ViTALiTy: Unifying Low-rank and Sparse Approximation for Vision Transformer Acceleration with a Linear Taylor Attention}, + doi = {10.1109/HPCA56546.2023.10071081}, + pages = {415-428}, + publisher = {IEEE Computer Society}, + address = {Los Alamitos, CA, USA}, + keywords = {training;costs;systematics;approximation algorithms;transformers;boosting;sparse representation}, + month = {mar}, + year = {2023}, +} + +@InProceedings{Nangia2018, + author = {Nangia, Nikita and Bowman, Samuel}, + booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Student Research Workshop}, + title = {{L}ist{O}ps: A Diagnostic Dataset for Latent Tree Learning}, + doi = {10.18653/v1/N18-4013}, + editor = {Cordeiro, Silvio Ricardo and Oraby, Shereen and Pavalanathan, Umashanthi and Rim, Kyeongmin}, + pages = {92--99}, + publisher = {Association for Computational Linguistics}, + address = {New Orleans, Louisiana, USA}, + month = jun, + year = {2018}, +} + +@Misc{Chen2024, + author = {Chen, Hongzhan and Quan, Xiaojun and Chen, Hehong and Yan, Ming and Zhang, Ji}, + date = {2024}, + title = {Knowledge Distillation for Closed-Source Language Models}, + doi = {10.48550/ARXIV.2401.07013}, + eprint = {2401.07013}, + eprinttype = {arxiv}, + file = {:Chen2024 - Knowledge Distillation for Closed Source Language Models.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences}, +} + +@Misc{ElNouby2024, + author = {El-Nouby, Alaaeldin and Klein, Michal and Zhai, Shuangfei and Bautista, Miguel Angel and Toshev, Alexander and Shankar, Vaishaal and Susskind, Joshua M and Joulin, Armand}, + date = {2024}, + title = {Scalable Pre-training of Large Autoregressive Image Models}, + doi = {10.48550/ARXIV.2401.08541}, + eprint = {2401.08541}, + 
eprinttype = {arxiv}, + file = {:ElNouby2024 - Scalable Pre Training of Large Autoregressive Image Models.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + priority = {prio3}, +} + +@Misc{Heddes2024, + author = {Heddes, Mike and Srinivasa, Narayan and Givargis, Tony and Nicolau, Alexandru}, + date = {2024}, + title = {Always-Sparse Training by Growing Connections with Guided Stochastic Exploration}, + doi = {10.48550/ARXIV.2401.06898}, + eprint = {2401.06898}, + eprinttype = {arxiv}, + file = {:Heddes2024 - Always Sparse Training by Growing Connections with Guided Stochastic Exploration.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, +} + +@Misc{Li2024, + author = {Li, Y. and Lei, Y. and Yang, X.}, + date = {2024}, + title = {Spikeformer: Training high-performance spiking neural network with transformer}, + url = {https://www.sciencedirect.com/science/article/pii/S092523122400050X}, + urldate = {2024-02-05}, +} + +@Misc{Owen2024, + author = {Owen, David}, + date = {2024}, + title = {How predictable is language model benchmark performance?}, + doi = {10.48550/ARXIV.2401.04757}, + eprint = {2401.04757}, + eprinttype = {arxiv}, + file = {:Owen2024 - How Predictable Is Language Model Benchmark Performance_.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences}, + priority = {prio3}, +} + +@Misc{Zhou2024, + author = {Zhou, Xingyu and Zhang, Leheng and Zhao, Xiaorui and Wang, Keze and Li, Leida and Gu, Shuhang}, + date = {2024}, + title = {Video Super-Resolution Transformer with Masked Inter \& Intra-Frame Attention}, + doi = {10.48550/ARXIV.2401.06312}, + eprint = {2401.06312}, + eprinttype = {arxiv}, + file = {:Zhou2024 - Video 
Super Resolution Transformer with Masked Inter & Intra Frame Attention.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, +} + +@Misc{Godey2024, + author = {Godey, Nathan and de la Clergerie, Éric and Sagot, Benoît}, + date = {2024}, + title = {Anisotropy Is Inherent to Self-Attention in Transformers}, + doi = {10.48550/ARXIV.2401.12143}, + eprint = {2401.12143}, + eprinttype = {arxiv}, + file = {:Godey2024 - Anisotropy Is Inherent to Self Attention in Transformers.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, +} + +@Misc{Li2024a, + author = {Li, Mukai and Li, Lei and Yin, Yuwei and Ahmed, Masood and Liu, Zhenguang and Liu, Qi}, + date = {2024}, + title = {Red Teaming Visual Language Models}, + doi = {10.48550/ARXIV.2401.12915}, + eprint = {2401.12915}, + eprinttype = {arxiv}, + file = {:Li2024a - Red Teaming Visual Language Models.pdf:PDF}, + keywords = {Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, +} + +@Misc{Zhang2024, + author = {Zhang, Yiyuan and Ding, Xiaohan and Gong, Kaixiong and Ge, Yixiao and Shan, Ying and Yue, Xiangyu}, + date = {2024}, + title = {Multimodal Pathway: Improve Transformers with Irrelevant Data from Other Modalities}, + doi = {10.48550/ARXIV.2401.14405}, + eprint = {2401.14405}, + eprinttype = {arxiv}, + file = {:Zhang2024 - Multimodal Pathway_ Improve Transformers with Irrelevant Data from Other Modalities.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, + qualityassured = 
{qualityAssured}, + readstatus = {read}, +} + +@Misc{Lee2024, + author = {Lee, Seungho and Kang, Seoungyoon and Shim, Hyunjung}, + date = {2024}, + title = {Self-Supervised Vision Transformers Are Efficient Segmentation Learners for Imperfect Labels}, + doi = {10.48550/ARXIV.2401.12535}, + eprint = {2401.12535}, + eprinttype = {arxiv}, + file = {:Lee2024 - Self Supervised Vision Transformers Are Efficient Segmentation Learners for Imperfect Labels.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, +} + +@InProceedings{Alayrac2022, + author = {Jean-Baptiste Alayrac and Jeff Donahue and Pauline Luc and Antoine Miech and Iain Barr and Yana Hasson and Karel Lenc and Arthur Mensch and Katherine Millican and Malcolm Reynolds and Roman Ring and Eliza Rutherford and Serkan Cabi and Tengda Han and Zhitao Gong and Sina Samangooei and Marianne Monteiro and Jacob Menick and Sebastian Borgeaud and Andrew Brock and Aida Nematzadeh and Sahand Sharifzadeh and Mikolaj Binkowski and Ricardo Barreira and Oriol Vinyals and Andrew Zisserman and Karen Simonyan}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Flamingo: a Visual Language Model for Few-Shot Learning}, + editor = {Alice H. 
Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, + url = {https://openreview.net/forum?id=EbMuimAbPbs}, + creationdate = {2024-02-06T15:02:51}, + file = {:Alayrac2022 - Flamingo_ a Visual Language Model for Few Shot Learning.pdf:PDF}, + year = {2022}, +} + +@InProceedings{Krause2013, + author = {Jonathan Krause and Michael Stark and Jia Deng and Li Fei-Fei}, + booktitle = {4th International IEEE Workshop on 3D Representation and Recognition (3dRR-13)}, + title = {3D Object Representations for Fine-Grained Categorization}, + address = {Sydney, Australia}, + comment = {Stanford Cars Dataset}, + year = {2013}, +} + +@InProceedings{Nilsback2008, + author = {Maria-Elena Nilsback and Andrew Zisserman}, + booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing}, + title = {Automated Flower Classification over a Large Number of Classes}, + comment = {Oxford Flowers 102 Dataset}, + month = {Dec}, + year = {2008}, +} + +@InProceedings{Zhou2014, + author = {Zhou, Bolei and Lapedriza, Agata and Xiao, Jianxiong and Torralba, Antonio and Oliva, Aude}, + booktitle = {Advances in Neural Information Processing Systems}, + date = {2014}, + title = {Learning Deep Features for Scene Recognition using Places Database}, + editor = {Z. Ghahramani and M. Welling and C. Cortes and N. Lawrence and K.Q. 
Weinberger}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper_files/paper/2014/file/3fe94a002317b5f9259f82690aeea4cd-Paper.pdf}, + volume = {27}, + comment = {MIT Places365 Dataset}, + creationdate = {2024-02-08T11:29:18}, + year = {2014}, +} + +@InProceedings{Wang2020a, + author = {Wang, Hanrui and Wu, Zhanghao and Liu, Zhijian and Cai, Han and Zhu, Ligeng and Gan, Chuang and Han, Song}, + booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, + title = {{HAT}: Hardware-Aware Transformers for Efficient Natural Language Processing}, + doi = {10.18653/v1/2020.acl-main.686}, + editor = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel}, + pages = {7675--7688}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2020.acl-main.686}, + address = {Online}, + file = {:Wang2020a - HAT_ Hardware Aware Transformers for Efficient Natural Language Processing.pdf:PDF}, + month = jul, + year = {2020}, +} + +@InProceedings{Tabani2021, + author = {Tabani, Hamid and Balasubramaniam, Ajay and Marzban, Shabbir and Arani, Elahe and Zonooz, Bahram}, + booktitle = {2021 24th Euromicro Conference on Digital System Design (DSD)}, + title = {Improving the Efficiency of Transformers for Resource-Constrained Devices}, + doi = {10.1109/DSD53832.2021.00074}, + pages = {449-456}, + file = {:Tabani2021 - Improving the Efficiency of Transformers for Resource Constrained Devices.pdf:PDF}, + keywords = {Performance evaluation;Computer vision;Digital systems;Memory management;Transformers;Data transfer;Natural language processing;Deep Learning;Transformers;Clustering;Resource-Constrained Devices}, + priority = {prio3}, + year = {2021}, +} + +@Misc{Torpey2024, + author = {Torpey, David and Klein, Richard}, + date = {2024}, + title = {Affine transformation estimation improves visual self-supervised learning}, + doi = {10.48550/ARXIV.2402.09071}, + eprint 
= {2402.09071}, + eprinttype = {arxiv}, + creationdate = {2024-02-19T14:49:01}, + file = {:auto/Torpey2024 - Affine_transformation_estimation_improves_visual_self-supervised_learning.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, +} + +@Article{Wu2018, + author = {Wu, Shuang and Li, Guoqi and Deng, Lei and Liu, Liu and Xie, Yuan and Shi, Luping}, + date = {2018-02-27}, + journaltitle = {IEEE Transactions on Neural Networks and Learning Systems}, + title = {L1-Norm Batch Normalization for Efficient Training of Deep Neural Networks}, + doi = {10.1109/tnnls.2018.2876179}, + eprint = {1802.09769}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + issn = {2162-2388}, + number = {7}, + pages = {2043--2051}, + volume = {30}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Wu2018 - L1 Norm Batch Normalization for Efficient Training of Deep Neural Networks.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + month = jul, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2018}, +} + +@Article{Arora2024, + author = {Arora, Simran and Eyuboglu, Sabri and Zhang, Michael and Timalsina, Aman and Alberti, Silas and Zinsley, Dylan and Zou, James and Rudra, Atri and Ré, Christopher}, + date = {2024-02-28}, + title = {Simple linear attention language models balance the recall-throughput tradeoff}, + doi = {10.48550/ARXIV.2402.18668}, + eprint = {2402.18668}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Zero v1.0 Universal}, + file = {:Arora2024 - Simple Linear Attention Language Models Balance the Recall Throughput Tradeoff.pdf:PDF}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = 
{arXiv}, + year = {2024}, +} + +@InProceedings{Nauen2024, + author = {Nauen, Tobias Christian and Palacio, Sebastian and Dengel, Andreas}, + booktitle = {Pattern Recognition}, + title = {TaylorShift: Shifting the Complexity of Self-attention from Squared to Linear (and Back) Using Taylor-Softmax}, + doi = {10.1007/978-3-031-78172-8_1}, + editor = {Antonacopoulos, Apostolos and Chaudhuri, Subhasis and Chellappa, Rama and Liu, Cheng-Lin and Bhattacharya, Saumik and Pal, Umapada}, + isbn = {978-3-031-78172-8}, + note = {ICPR 2024 (oral)}, + pages = {1--16}, + publisher = {Springer Nature Switzerland}, + address = {Cham}, + file = {:auto/Nauen2024 - TaylorShift__Shifting_the_Complexity_of_Self-Attention_from_Squared_to_Linear_(and_Back)_using_Taylor-Softmax.pdf:PDF}, + year = {2024}, +} + +@Article{Yang2024, + author = {Yang, Kai and Ackermann, Jan and He, Zhenyu and Feng, Guhao and Zhang, Bohang and Feng, Yunzhen and Ye, Qiwei and He, Di and Wang, Liwei}, + date = {2024-02-21}, + title = {Do Efficient Transformers Really Save Computation?}, + doi = {10.48550/ARXIV.2402.13934}, + eprint = {2402.13934}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + creationdate = {2024-03-11T14:19:39}, + file = {:Yang2024 - Do Efficient Transformers Really Save Computation_.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Zuo2022, + author = {Shuangquan Zuo and Yun Xiao and Xiaojun Chang and Xuanhong Wang}, + date = {2022-10}, + journaltitle = {Knowledge-Based Systems}, + title = {Vision transformers for dense prediction: A survey}, + doi = {10.1016/j.knosys.2022.109552}, + issn = {0950-7051}, + pages = {109552}, + url = 
{https://www.sciencedirect.com/science/article/pii/S0950705122007821}, + volume = {253}, + creationdate = {2024-03-11T16:43:35}, + file = {:Zuo2022 - Vision Transformers for Dense Prediction_ a Survey.pdf:PDF}, + journal = {Knowledge-Based Systems}, + keywords = {Deep learning, Transformer, Dense prediction, Computer vision}, + publisher = {Elsevier {BV}}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2022}, +} + +@Article{Strudel2021, + author = {Strudel, Robin and Garcia, Ricardo and Laptev, Ivan and Schmid, Cordelia}, + date = {2021-05-12}, + title = {Segmenter: Transformer for Semantic Segmentation}, + doi = {10.48550/ARXIV.2105.05633}, + eprint = {2105.05633}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + creationdate = {2024-03-12T16:21:00}, + file = {:Strudel2021 - Segmenter_ Transformer for Semantic Segmentation.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Chen2022a, + author = {Chen, Zhe and Duan, Yuchen and Wang, Wenhai and He, Junjun and Lu, Tong and Dai, Jifeng and Qiao, Yu}, + date = {2022-05-17}, + title = {Vision Transformer Adapter for Dense Predictions}, + doi = {10.48550/ARXIV.2205.08534}, + eprint = {2205.08534}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {This work investigates a simple yet powerful dense prediction task adapter for Vision Transformer (ViT). Unlike recently advanced variants that incorporate vision-specific inductive biases into their architectures, the plain ViT suffers inferior performance on dense predictions due to weak prior assumptions. 
To address this issue, we propose the ViT-Adapter, which allows plain ViT to achieve comparable performance to vision-specific transformers. Specifically, the backbone in our framework is a plain ViT that can learn powerful representations from large-scale multi-modal data. When transferring to downstream tasks, a pre-training-free adapter is used to introduce the image-related inductive biases into the model, making it suitable for these tasks. We verify ViT-Adapter on multiple dense prediction tasks, including object detection, instance segmentation, and semantic segmentation. Notably, without using extra detection data, our ViT-Adapter-L yields state-of-the-art 60.9 box AP and 53.0 mask AP on COCO test-dev. We hope that the ViT-Adapter could serve as an alternative for vision-specific transformers and facilitate future research. The code and models will be released at https://github.com/czczup/ViT-Adapter.}, + copyright = {arXiv.org perpetual, non-exclusive license}, + creationdate = {2024-03-12T16:47:07}, + file = {:Chen2022a - Vision Transformer Adapter for Dense Predictions.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Teney2024, + author = {Damien Teney and Armand Nicolicioiu and Valentin Hartmann and Ehsan Abbasnejad}, + date = {2024-03-04}, + journaltitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2024}, + title = {Neural Redshift: Random Networks are not Random Functions}, + eprint = {2403.02241}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + creationdate = {2024-03-13T11:44:28}, + file = {:Teney2024 - Neural Redshift_ Random Networks Are Not Random Functions.pdf:PDF}, + keywords = {cs.LG, cs.AI, cs.CV}, + qualityassured = {qualityAssured}, + readstatus = {read}, +} + +@InProceedings{Zheng2023, + author = {Lin Zheng and Jianbo Yuan and Chong Wang and Lingpeng Kong}, + booktitle = 
{The Eleventh International Conference on Learning Representations}, + title = {Efficient Attention via Control Variates}, + doi = {10.48550/ARXIV.2302.04542}, + eprint = {2302.04542}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {:Zheng2023 - Efficient Attention Via Control Variates.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@InProceedings{Peng2021a, + author = {Peng, Hao and Pappas, Nikolaos and Yogatama, Dani and Schwartz, Roy and Smith, Noah A. and Kong, Lingpeng}, + booktitle = {International Conference on Learning Representations}, + date = {2021-03-03}, + title = {Random Feature Attention}, + eprint = {2103.02143}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + file = {:Peng2021a - Random Feature Attention.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Xiong2022, + author = {Xiong, Wenhan and Oguz, Barlas and Gupta, Anchit and Chen, Xilun and Liskovich, Diana and Levy, Omer and Yih, Scott and Mehdad, Yashar}, + booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + title = {Simple Local Attentions Remain Competitive for Long-Context Tasks}, + doi = {10.18653/v1/2022.naacl-main.144}, + editor = {Carpuat, Marine and de Marneffe, Marie-Catherine and Meza Ruiz, Ivan Vladimir}, + eprint = {2112.07210}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + pages = {1975--1986}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2022.naacl-main.144}, + address = {Seattle, United States}, + file = {:Xiong2021a - Simple Local Attentions Remain 
Competitive for Long Context Tasks.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + month = jul, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Zhang2023b, + author = {Zhang, Jun and Jiang, Shuyang and Feng, Jiangtao and Zheng, Lin and Kong, Lingpeng}, + booktitle = {International Conference on Machine Learning}, + title = {CAB: Comprehensive Attention Benchmarking on Long Sequence Modeling}, + eprint = {2210.07661}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + organization = {PMLR}, + pages = {41194--41218}, + file = {:Zhang2023b - CAB_ Comprehensive Attention Benchmarking on Long Sequence Modeling.pdf:PDF}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Kitaev2020, + author = {Kitaev, Nikita and Kaiser, Lukasz and Levskaya, Anselm}, + date = {2020-01-13}, + title = {Reformer: The Efficient Transformer}, + doi = {10.48550/ARXIV.2001.04451}, + eprint = {2001.04451}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + abstract = {Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its complexity from O($L^2$) to O($L\log L$), where $L$ is the length of the sequence. Furthermore, we use reversible residual layers instead of the standard residuals, which allows storing activations only once in the training process instead of $N$ times, where $N$ is the number of layers. 
The resulting model, the Reformer, performs on par with Transformer models while being much more memory-efficient and much faster on long sequences.}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Kitaev2020 - Reformer_ the Efficient Transformer.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2020}, +} + +@InProceedings{Radford2019, + author = {Alec Radford and Jeff Wu and Rewon Child and David Luan and Dario Amodei and Ilya Sutskever}, + title = {Language Models are Unsupervised Multitask Learners}, + url = {https://api.semanticscholar.org/CorpusID:160025533}, + year = {2019}, +} + +@Article{Li2023c, + author = {Li, Xiangtai and Ding, Henghui and Yuan, Haobo and Zhang, Wenwei and Pang, Jiangmiao and Cheng, Guangliang and Chen, Kai and Liu, Ziwei and Loy, Chen Change}, + date = {2023-04-19}, + title = {Transformer-Based Visual Segmentation: A Survey}, + doi = {10.48550/ARXIV.2304.09854}, + eprint = {2304.09854}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Li2023c - Transformer Based Visual Segmentation_ a Survey.pdf:PDF:http\://arxiv.org/pdf/2304.09854v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Quetu2023, + author = {Quétu, Victor and Milovanovic, Marta and Tartaglione, Enzo}, + date = {2023-07-26}, + title = {Sparse Double Descent in Vision Transformers: real or phantom threat?}, + doi = {10.1007/978-3-031-43153-1_41}, + eprint = {2307.14253}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + issn = {1611-3349}, + pages = {490--502}, + booktitle = {Lecture Notes in Computer Science}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Quetu2023 - Sparse Double Descent in 
Vision Transformers_ Real or Phantom Threat_.pdf:PDF:http\://arxiv.org/pdf/2307.14253v1}, + isbn = {9783031431531}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {Springer Nature Switzerland}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Frumkin2023, + author = {Frumkin, Natalia and Gope, Dibakar and Marculescu, Diana}, + date = {2023-08-21}, + title = {Jumping through Local Minima: Quantization in the Loss Landscape of Vision Transformers}, + doi = {10.48550/ARXIV.2308.10814}, + eprint = {2308.10814}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Share Alike 4.0 International}, + file = {:Frumkin2023 - Jumping through Local Minima_ Quantization in the Loss Landscape of Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2308.10814v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Liu2021a, + author = {Liu, Yahui and Sangineto, Enver and Bi, Wei and Sebe, Nicu and Lepri, Bruno and De Nadai, Marco}, + date = {2021-06-07}, + journaltitle = {Proceedings of the 35th Conference on Neural Information Processing Systems (NeurIPS) 2021}, + title = {Efficient Training of Visual Transformers with Small Datasets}, + doi = {10.48550/ARXIV.2106.03746}, + eprint = {2106.03746}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Liu2021a - Efficient Training of Visual Transformers with Small Datasets.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + year = {2021}, +} + +@Misc{Yang2024a, + author = {Yang, Lihe 
and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, + date = {2024-01-19}, + title = {Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data}, + doi = {10.48550/ARXIV.2401.10891}, + eprint = {2401.10891}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {1. Introduction}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:http\://arxiv.org/pdf/2401.10891v2:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + number = {zero-/few-}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Misc{Aich2024, + author = {Aich, Abhishek and Suh, Yumin and Schulter, Samuel and Chandraker, Manmohan}, + date = {2024}, + title = {Progressive Token Length Scaling in Transformer Encoders for Efficient Universal Segmentation}, + doi = {10.48550/ARXIV.2404.14657}, + url = {https://arxiv.org/abs/2404.14657}, + urldate = {2024-04-25}, + creationdate = {2024-04-25}, + file = {:auto/Aich2024 - Progressive_Token_Length_Scaling_in_Transformer_Encoders_for_Efficient_Universal_Segmentation.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, +} + +@Article{Bafghi2024, + author = {Bafghi, Reza Akbarian and Harilal, Nidhin and Monteleoni, Claire and Raissi, Maziar}, + date = {2024-04-26}, + title = {Parameter Efficient Fine-tuning of Self-supervised ViTs without Catastrophic Forgetting}, + doi = {10.48550/ARXIV.2404.17245}, + eprint = {2404.17245}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Share Alike 4.0 International}, + file = {:Bafghi2024 - Parameter Efficient Fine Tuning of Self Supervised ViTs without Catastrophic Forgetting.pdf:PDF:http\://arxiv.org/pdf/2404.17245v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer 
and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@InProceedings{Zhu2021, + author = {Zhu, Chen and Ni, Renkun and Xu, Zheng and Kong, Kezhi and Huang, W. Ronny and Goldstein, Tom}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {GradInit: Learning to Initialize Neural Networks for Stable and Efficient Training}, + editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan}, + pages = {16410--16422}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper_files/paper/2021/file/88ae6372cfdc5df69a976e893f4d554b-Paper.pdf}, + volume = {34}, + file = {:Zhu2021 - GradInit_ Learning to Initialize Neural Networks for Stable and Efficient Training.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Skorski2021, + author = {Skorski, Maciej and Temperoni, Alessandro and Theobald, Martin}, + booktitle = {Proceedings of The 13th Asian Conference on Machine Learning}, + title = {Revisiting Weight Initialization of Deep Neural Networks}, + editor = {Balasubramanian, Vineeth N. 
and Tsang, Ivor}, + pages = {1192--1207}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + url = {https://proceedings.mlr.press/v157/skorski21a.html}, + volume = {157}, + file = {:Skorski2021 - Revisiting Weight Initialization of Deep Neural Networks.pdf:PDF}, + month = {17--19 Nov}, + pdf = {https://proceedings.mlr.press/v157/skorski21a/skorski21a.pdf}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Schuerholt2022, + author = {Schürholt, Konstantin and Knyazev, Boris and Giró-i-Nieto, Xavier and Borth, Damian}, + date = {2022-09-29}, + title = {Hyper-Representations as Generative Models: Sampling Unseen Neural Network Weights}, + doi = {10.48550/ARXIV.2209.14733}, + eprint = {2209.14733}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Schuerholt2022 - Hyper Representations As Generative Models_ Sampling Unseen Neural Network Weights.pdf:PDF:http\://arxiv.org/pdf/2209.14733v1}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Das2021, + author = {Das, Debasmit and Bhalgat, Yash and Porikli, Fatih}, + date = {2021-05-02}, + title = {Data-driven Weight Initialization with Sylvester Solvers}, + doi = {10.48550/ARXIV.2105.10335}, + eprint = {2105.10335}, + eprintclass = {cs.NE}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Das2021 - Data Driven Weight Initialization with Sylvester Solvers.pdf:PDF:http\://arxiv.org/pdf/2105.10335v1}, + keywords = {Neural and Evolutionary Computing (cs.NE), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = 
{read}, + year = {2021}, +} + +@Article{Yang2022c, + author = {Yang, Yibo and Wang, Hong and Yuan, Haobo and Lin, Zhouchen}, + date = {2022-10-12}, + title = {Towards Theoretically Inspired Neural Initialization Optimization}, + doi = {10.48550/ARXIV.2210.05956}, + eprint = {2210.05956}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Yang2022c - Towards Theoretically Inspired Neural Initialization Optimization.pdf:PDF:http\://arxiv.org/pdf/2210.05956v1}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Geiping2022, + author = {Geiping, Jonas and Goldblum, Micah and Somepalli, Gowthami and Shwartz-Ziv, Ravid and Goldstein, Tom and Wilson, Andrew Gordon}, + date = {2022-10-12}, + title = {How Much Data Are Augmentations Worth? 
An Investigation into Scaling Laws, Invariance, and Implicit Regularization}, + doi = {10.48550/ARXIV.2210.06441}, + eprint = {2210.06441}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + note = {under review for ICLR 2023}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Geiping2022 - How Much Data Are Augmentations Worth_ an Investigation into Scaling Laws, Invariance, and Implicit Regularization.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Xu2023a, + author = {Xu, Jiarui and Liu, Sifei and Vahdat, Arash and Byeon, Wonmin and Wang, Xiaolong and De Mello, Shalini}, + date = {2023-03-08}, + title = {Open-Vocabulary Panoptic Segmentation with Text-to-Image Diffusion Models}, + doi = {10.48550/ARXIV.2303.04803}, + eprint = {2303.04803}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Xu2023a - Open Vocabulary Panoptic Segmentation with Text to Image Diffusion Models.pdf:PDF:http\://arxiv.org/pdf/2303.04803v4}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Min2022, + author = {Min, Seonwoo and Park, Nokyung and Kim, Siwon and Park, Seunghyun and Kim, Jinkyu}, + date = {2022-07-21}, + title = {Grounding Visual Representations with Texts for Domain Generalization}, + doi = {10.48550/ARXIV.2207.10285}, + eprint = {2207.10285}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Min2022 - Grounding Visual Representations with Texts for Domain Generalization.pdf:PDF:http\://arxiv.org/pdf/2207.10285v2}, + keywords = {Computer 
Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Plummer2015, + author = {Plummer, Bryan A. and Wang, Liwei and Cervantes, Chris M. and Caicedo, Juan C. and Hockenmaier, Julia and Lazebnik, Svetlana}, + date = {2015-05-19}, + title = {Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models}, + doi = {10.48550/ARXIV.1505.04870}, + eprint = {1505.04870}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Plummer2015 - Flickr30k Entities_ Collecting Region to Phrase Correspondences for Richer Image to Sentence Models.pdf:PDF:http\://arxiv.org/pdf/1505.04870v4}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2015}, +} + +@Article{Frankle2018, + author = {Frankle, Jonathan and Carbin, Michael}, + date = {2018-03-09}, + journaltitle = {ICLR 2019}, + title = {The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks}, + doi = {10.48550/ARXIV.1803.03635}, + eprint = {1803.03635}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Frankle2018 - The Lottery Ticket Hypothesis_ Finding Sparse, Trainable Neural Networks.pdf:PDF:http\://arxiv.org/pdf/1803.03635v5}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2018}, +} + +@Article{Lee2024a, + author = {Lee, Hojun and Kim, Suyoung and Lee, Junhoo and 
Yoo, Jaeyoung and Kwak, Nojun}, + date = {2024-04-14}, + title = {Coreset Selection for Object Detection}, + doi = {10.48550/ARXIV.2404.09161}, + eprint = {2404.09161}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Lee2024a - Coreset Selection for Object Detection.pdf:PDF:http\://arxiv.org/pdf/2404.09161v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Wu2023, + author = {Wu, Xindi and Zhang, Byron and Deng, Zhiwei and Russakovsky, Olga}, + date = {2023-08-15}, + title = {Vision-Language Dataset Distillation}, + doi = {10.48550/ARXIV.2308.07545}, + eprint = {2308.07545}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Wu2023 - Vision Language Dataset Distillation.pdf:PDF:http\://arxiv.org/pdf/2308.07545v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Nakkiran2020, + author = {Preetum Nakkiran and Gal Kaplun and Yamini Bansal and Tristan Yang and Boaz Barak and Ilya Sutskever}, + booktitle = {International Conference on Learning Representations}, + title = {Deep Double Descent: Where Bigger Models and More Data Hurt}, + eprint = {1912.02292}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + publisher = {arXiv}, + url = {https://openreview.net/forum?id=B1g5sA4twr}, + file = {:Nakkiran2020 - Deep Double Descent_ Where Bigger Models and More Data Hurt.pdf:PDF:http\://arxiv.org/pdf/1912.02292v1}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), Machine Learning (stat.ML), FOS: Computer and 
information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@Article{Laurencon2024, + author = {Laurençon, Hugo and Tronchon, Léo and Cord, Matthieu and Sanh, Victor}, + date = {2024-05-03}, + title = {What matters when building vision-language models?}, + doi = {10.48550/ARXIV.2405.02246}, + eprint = {2405.02246}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Laurencon2024 - What Matters When Building Vision Language Models_.pdf:PDF:http\://arxiv.org/pdf/2405.02246v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Cheekati2024, + author = {Cheekati, Shravan}, + date = {2024-05-02}, + title = {Early Transformers: A study on Efficient Training of Transformer Models through Early-Bird Lottery Tickets}, + doi = {10.48550/ARXIV.2405.02353}, + eprint = {2405.02353}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Share Alike 4.0 International}, + file = {:Cheekati2024 - Early Transformers_ a Study on Efficient Training of Transformer Models through Early Bird Lottery Tickets.pdf:PDF:http\://arxiv.org/pdf/2405.02353v1}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Lavoie2024, + author = {Lavoie, Samuel and Kirichenko, Polina and Ibrahim, Mark and Assran, Mahmoud and Wilson, Andrew Gordon and Courville, Aaron and Ballas, Nicolas}, + date = {2024-04-30}, + title = {Modeling Caption Diversity in Contrastive Vision-Language Pretraining}, + doi = {10.48550/ARXIV.2405.00740}, + eprint = {2405.00740}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 
International}, + file = {:Lavoie2024 - Modeling Caption Diversity in Contrastive Vision Language Pretraining.pdf:PDF:http\://arxiv.org/pdf/2405.00740v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Schaeffer2023a, + author = {Schaeffer, Rylan and Khona, Mikail and Robertson, Zachary and Boopathy, Akhilan and Pistunova, Kateryna and Rocks, Jason W. and Fiete, Ila Rani and Koyejo, Oluwasanmi}, + date = {2023-03-24}, + title = {Double Descent Demystified: Identifying, Interpreting & Ablating the Sources of a Deep Learning Puzzle}, + doi = {10.48550/ARXIV.2303.14151}, + eprint = {2303.14151}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Schaeffer2023a - Double Descent Demystified_ Identifying, Interpreting & Ablating the Sources of a Deep Learning Puzzle.pdf:PDF}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Curth2023, + author = {Curth, Alicia and Jeffares, Alan and van der Schaar, Mihaela}, + date = {2023-10-29}, + title = {A U-turn on Double Descent: Rethinking Parameter Counting in Statistical Learning}, + doi = {10.48550/ARXIV.2310.18988}, + eprint = {2310.18988}, + eprintclass = {stat.ML}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Curth2023 - A U Turn on Double Descent_ Rethinking Parameter Counting in Statistical Learning.pdf:PDF}, + keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2023}, 
+} + +@Article{Patro2023a, + author = {Patro, Badri N. and Agneeswaran, Vijay Srinivas}, + date = {2023-11-02}, + title = {Scattering Vision Transformer: Spectral Mixing Matters}, + doi = {10.48550/ARXIV.2311.01310}, + eprint = {2311.01310}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Patro2023a - Scattering Vision Transformer_ Spectral Mixing Matters.pdf:PDF:http\://arxiv.org/pdf/2311.01310v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Image and Video Processing (eess.IV), Signal Processing (eess.SP), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@InProceedings{Guibas2022, + author = {John Guibas and Morteza Mardani and Zongyi Li and Andrew Tao and Anima Anandkumar and Bryan Catanzaro}, + booktitle = {International Conference on Learning Representations}, + title = {Efficient Token Mixing for Transformers via Adaptive Fourier Neural Operators}, + eprint = {2111.13587}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=EXHG-A3jlM}, + file = {:Guibas2021 - Adaptive Fourier Neural Operators_ Efficient Token Mixers for Transformers.pdf:PDF:http\://arxiv.org/pdf/2111.13587v2}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Patro2023b, + author = {Patro, Badri N. and Namboodiri, Vinay P. 
and Agneeswaran, Vijay Srinivas}, + date = {2023-04-13}, + title = {SpectFormer: Frequency and Attention is what you need in a Vision Transformer}, + doi = {10.48550/ARXIV.2304.06446}, + eprint = {2304.06446}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Patro2023b - SpectFormer_ Frequency and Attention Is What You Need in a Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2304.06446v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Beck2024, + author = {Beck, Maximilian and Pöppel, Korbinian and Spanring, Markus and Auer, Andreas and Prudnikova, Oleksandra and Kopp, Michael and Klambauer, Günter and Brandstetter, Johannes and Hochreiter, Sepp}, + date = {2024-05-07}, + title = {xLSTM: Extended Long Short-Term Memory}, + doi = {10.48550/ARXIV.2405.04517}, + eprint = {2405.04517}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Beck2024 - XLSTM_ Extended Long Short Term Memory.pdf:PDF:http\://arxiv.org/pdf/2405.04517v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Babiloni2023, + author = {Francesca Babiloni and Ioannis Marras and Jiankang Deng and Filippos Kokkinos and Matteo Maggioni and Grigorios Chrysos and Philip Torr and Stefanos Zafeiriou}, + title = {Linear Complexity Self-Attention with 3rd Order Polynomials}, + doi = {10.1109/tpami.2022.3231971}, + pages = {1--12}, + file = {:Babiloni2023 - Linear Complexity 
Self Attention with 3rd Order Polynomials.pdf:PDF;:Babiloni2023 - Linear Complexity Self Attention with _3$$_text$$rd$$$$_ Order Polynomials.pdf:PDF}, + journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence}, + keywords = {self-attention, non-local blocks, transformers, polynomial expansion, neural networks}, + publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Yu2024, + author = {Yu, Weihao and Wang, Xinchao}, + date = {2024-05-13}, + title = {MambaOut: Do We Really Need Mamba for Vision?}, + doi = {10.48550/ARXIV.2405.07992}, + eprint = {2405.07992}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Yu2024 - MambaOut_ Do We Really Need Mamba for Vision_.pdf:PDF:http\://arxiv.org/pdf/2405.07992v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Wei2024, + author = {Wei, Zihao and Pan, Zixuan and Owens, Andrew}, + date = {2024-05-14}, + title = {Efficient Vision-Language Pre-training by Cluster Masking}, + doi = {10.48550/ARXIV.2405.08815}, + eprint = {2405.08815}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {We propose a simple strategy for masking image patches during visual-language contrastive learning that improves the quality of the learned representations and the training speed. During each iteration of training, we randomly mask clusters of visually similar image patches, as measured by their raw pixel intensities. This provides an extra learning signal, beyond the contrastive training itself, since it forces a model to predict words for masked visual structures solely from context. 
It also speeds up training by reducing the amount of data used in each image. We evaluate the effectiveness of our model by pre-training on a number of benchmarks, finding that it outperforms other masking strategies, such as FLIP, on the quality of the learned representation.}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Wei2024 - Efficient Vision Language Pre Training by Cluster Masking.pdf:PDF:http\://arxiv.org/pdf/2405.08815v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Vasu2024, + author = {Vasu, Pavan Kumar Anasosalu and Pouransari, Hadi and Faghri, Fartash and Tuzel, Oncel}, + date = {2024-05-14}, + title = {CLIP with Quality Captions: A Strong Pretraining for Vision Tasks}, + doi = {10.48550/ARXIV.2405.08911}, + eprint = {2405.08911}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Vasu2024 - CLIP with Quality Captions_ a Strong Pretraining for Vision Tasks.pdf:PDF:http\://arxiv.org/pdf/2405.08911v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Vasu2023, + author = {Vasu, Pavan Kumar Anasosalu and Pouransari, Hadi and Faghri, Fartash and Vemulapalli, Raviteja and Tuzel, Oncel}, + date = {2023-11-28}, + title = {MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced Training}, + doi = {10.48550/ARXIV.2311.17049}, + eprint = {2311.17049}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Vasu2023 - MobileCLIP_ Fast Image Text Models through Multi Modal Reinforced Training.pdf:PDF:http\://arxiv.org/pdf/2311.17049v2}, + keywords = {Computer Vision and Pattern Recognition 
(cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Pan2024, + author = {Pan, Xu and Philip, Aaron and Xie, Ziqian and Schwartz, Odelia}, + date = {2024-04-04}, + title = {Dissecting Query-Key Interaction in Vision Transformers}, + doi = {10.48550/ARXIV.2405.14880}, + eprint = {2405.14880}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Pan2024 - Dissecting Query Key Interaction in Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2405.14880v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Yadkori2024, + author = {Yadkori, Yasin Abbasi and Kuzborskij, Ilja and György, András and Szepesvári, Csaba}, + date = {2024-06-04}, + title = {To Believe or Not to Believe Your LLM}, + doi = {10.48550/ARXIV.2406.02543}, + eprint = {2406.02543}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Yadkori2024 - To Believe or Not to Believe Your LLM.pdf:PDF:http\://arxiv.org/pdf/2406.02543v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Zhang2024a, + author = {Zhang, Shuoxi and Liu, Hanpeng and Lin, Stephen and He, Kun}, + date = {2024-06-01}, + title = {You Only Need Less Attention at Each Stage in Vision Transformers}, + doi = {10.48550/ARXIV.2406.00427}, + eprint = {2406.00427}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org 
perpetual, non-exclusive license}, + file = {:Zhang2024a - You Only Need Less Attention at Each Stage in Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2406.00427v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Rezaei2024, + author = {Rezaei, Razieh and Sabet, Masoud Jalili and Gu, Jindong and Rueckert, Daniel and Torr, Philip and Khakzar, Ashkan}, + date = {2024-06-05}, + title = {Learning Visual Prompts for Guiding the Attention of Vision Transformers}, + doi = {10.48550/ARXIV.2406.03303}, + eprint = {2406.03303}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Rezaei2024 - Learning Visual Prompts for Guiding the Attention of Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2406.03303v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@InProceedings{Park2024, + author = {Sungho Park and Hyeran Byun}, + booktitle = {Computer Vision and Pattern Recognition (CVPR)}, + title = {{Fair-VPT:} Fair Visual Prompt Tuning for Image Classification}, + file = {:Park2024 - Fair VPT_ Fair Visual Prompt Tuning for Image Classification.pdf:PDF}, + priority = {prio3}, + year = {2024}, +} + +@InProceedings{Hu2024CVPR, + author = {Xinting Hu and Li Jiang and Bernt Schiele}, + booktitle = {Computer Vision and Pattern Recognition (CVPR)}, + title = {Training Vision Transformers for Semi-Supervised Semantic Segmentation}, + file = {:Hu2024CVPR - Training Vision Transformers for Semi Supervised Semantic Segmentation.pdf:PDF}, + priority = {prio3}, + year = {2024}, +} + +@Article{Yang2024b, + author = {Yang, Songlin and Wang, Bailin and Zhang, Yu and Shen, Yikang and Kim, Yoon}, + date = {2024-06-10}, + title = 
{Parallelizing Linear Transformers with the Delta Rule over Sequence Length}, + doi = {10.48550/ARXIV.2406.06484}, + eprint = {2406.06484}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Zero v1.0 Universal}, + file = {:Yang2024b - Parallelizing Linear Transformers with the Delta Rule Over Sequence Length.pdf:PDF:http\://arxiv.org/pdf/2406.06484v1}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Mahmoud2023, + author = {Mahmoud, Anas and Elhoushi, Mostafa and Abbas, Amro and Yang, Yu and Ardalani, Newsha and Leather, Hugh and Morcos, Ari}, + date = {2023-10-03}, + title = {Sieve: Multimodal Dataset Pruning Using Image Captioning Models}, + doi = {10.48550/ARXIV.2310.02110}, + eprint = {2310.02110}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Mahmoud2023 - Sieve_ Multimodal Dataset Pruning Using Image Captioning Models.pdf:PDF:http\://arxiv.org/pdf/2310.02110v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Nguyen2024, + author = {Nguyen, Duy-Kien and Assran, Mahmoud and Jain, Unnat and Oswald, Martin R. and Snoek, Cees G. M. 
and Chen, Xinlei}, + date = {2024-06-13}, + title = {An Image is Worth More Than 16x16 Patches: Exploring Transformers on Individual Pixels}, + doi = {10.48550/ARXIV.2406.09415}, + eprint = {2406.09415}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Nguyen2024 - An Image Is Worth More Than 16x16 Patches_ Exploring Transformers on Individual Pixels.pdf:PDF:http\://arxiv.org/pdf/2406.09415v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Jiang2024, + author = {Jiang, Zhixing and Yin, Dennis and Khoda, Elham E and Loncar, Vladimir and Govorkova, Ekaterina and Moreno, Eric and Harris, Philip and Hauck, Scott and Hsu, Shih-Chieh}, + date = {2024-02-01}, + journaltitle = {Machine Learning and the Physical Sciences Workshop, NeurIPS 2023}, + title = {Ultra Fast Transformers on FPGAs for Particle Physics Experiments}, + doi = {10.48550/ARXIV.2402.01047}, + eprint = {2402.01047}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Jiang2024 - Ultra Fast Transformers on FPGAs for Particle Physics Experiments.pdf:PDF:http\://arxiv.org/pdf/2402.01047v1}, + keywords = {Machine Learning (cs.LG), Hardware Architecture (cs.AR), High Energy Physics - Experiment (hep-ex), FOS: Computer and information sciences, FOS: Physical sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@InProceedings{Li2020, + author = {Li, Bingbing and Pandey, Santosh and Fang, Haowen and Lyv, Yanjun and Li, Ji and Chen, Jieyang and Xie, Mimi and Wan, Lipeng and Liu, Hang and Ding, Caiwen}, + booktitle = {Proceedings of the ACM/IEEE International Symposium on Low Power Electronics and Design}, + title 
= {{FTRANS}: energy-efficient acceleration of transformers using {FPGA}}, + eprint = {2007.08563}, + eprintclass = {cs.DC}, + eprinttype = {arXiv}, + pages = {175--180}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Li2020 - FTRANS_ Energy Efficient Acceleration of Transformers Using FPGA.pdf:PDF:http\://arxiv.org/pdf/2007.08563v1}, + keywords = {Distributed / Parallel / Cluster Computing (cs.DC), Machine Learning (cs.LG), FOS: Computer and information sciences, C.1.4}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@InProceedings{Peng2021b, + author = {Peng, Hongwu and Huang, Shaoyi and Geng, Tong and Li, Ang and Jiang, Weiwen and Liu, Hang and Wang, Shusen and Ding, Caiwen}, + booktitle = {2021 22nd International Symposium on Quality Electronic Design (ISQED)}, + title = {Accelerating Transformer-based Deep Learning Models on FPGAs using Column Balanced Block Pruning}, + doi = {10.1109/ISQED51717.2021.9424344}, + pages = {142--148}, + file = {:Peng2021b - Accelerating Transformer Based Deep Learning Models on FPGAs Using Column Balanced Block Pruning.pdf:PDF}, + keywords = {Training;Deep learning;Graphics processing units;Parallel processing;Hardware;Natural language processing;Sparse matrices;Transformer;deep learning;pruning;acceleration;FPGA}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Devaguptapu2024, + author = {Devaguptapu, Chaitanya and Aithal, Sumukh and Ramasubramanian, Shrinivas and Yamada, Moyuru and Kaul, Manohar}, + date = {2024-06-18}, + title = {Semantic Graph Consistency: Going Beyond Patches for Regularizing Self-Supervised Vision Transformers}, + doi = {10.48550/ARXIV.2406.12944}, + eprint = {2406.12944}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Devaguptapu2024 - Semantic Graph Consistency_ Going beyond Patches for Regularizing 
Self Supervised Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2406.12944v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Cheon2024, + author = {Cheon, Minjong}, + date = {2024-06-21}, + title = {Demonstrating the Efficacy of Kolmogorov-Arnold Networks in Vision Tasks}, + doi = {10.48550/ARXIV.2406.14916}, + eprint = {2406.14916}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Cheon2024 - Demonstrating the Efficacy of Kolmogorov Arnold Networks in Vision Tasks.pdf:PDF:http\://arxiv.org/pdf/2406.14916v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Huang2024, + author = {Huang, Brandon and Mitra, Chancharik and Arbelle, Assaf and Karlinsky, Leonid and Darrell, Trevor and Herzig, Roei}, + date = {2024-06-21}, + title = {Multimodal Task Vectors Enable Many-Shot Multimodal In-Context Learning}, + doi = {10.48550/ARXIV.2406.15334}, + eprint = {2406.15334}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Huang2024 - Multimodal Task Vectors Enable Many Shot Multimodal in Context Learning.pdf:PDF:http\://arxiv.org/pdf/2406.15334v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Lu2021, + author = {Lu, Jiachen and Yao, Jinghan and Zhang, Junge and Zhu, Xiatian and Xu, Hang and Gao, Weiguo and Xu, Chunjing and Xiang, Tao and Zhang, Li}, + date = 
{2021-10-22}, + title = {SOFT: Softmax-free Transformer with Linear Complexity}, + doi = {10.48550/ARXIV.2110.11945}, + eprint = {2110.11945}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Lu2021 - SOFT_ Softmax Free Transformer with Linear Complexity.pdf:PDF:http\://arxiv.org/pdf/2110.11945v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Lou2024, + author = {Lou, Chao and Jia, Zixia and Zheng, Zilong and Tu, Kewei}, + date = {2024-06-24}, + title = {Sparser is Faster and Less is More: Efficient Sparse Attention for Long-Range Transformers}, + doi = {10.48550/ARXIV.2406.16747}, + eprint = {2406.16747}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Lou2024 - Sparser Is Faster and Less Is More_ Efficient Sparse Attention for Long Range Transformers.pdf:PDF:http\://arxiv.org/pdf/2406.16747v1}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@InProceedings{You2023, + author = {You, Haoran and Shi, Huihong and Guo, Yipin and Lin, Yingyan}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {ShiftAddViT: Mixture of Multiplication Primitives Towards Efficient Vision Transformer}, + editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. 
Levine}, + pages = {33319--33337}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/69c49f75ca31620f1f0d38093d9f3d9b-Paper-Conference.pdf}, + volume = {36}, + file = {:You2023 - ShiftAddViT_ Mixture of Multiplication Primitives Towards Efficient Vision Transformer.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Han2023a, + author = {Han, Dongchen and Pan, Xuran and Han, Yizeng and Song, Shiji and Huang, Gao}, + date = {2023-08-01}, + title = {FLatten Transformer: Vision Transformer using Focused Linear Attention}, + doi = {10.48550/ARXIV.2308.00442}, + eprint = {2308.00442}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Han2023a - FLatten Transformer_ Vision Transformer Using Focused Linear Attention.pdf:PDF:http\://arxiv.org/pdf/2308.00442v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Nandam2024, + author = {Nandam, Srinivasa Rao and Atito, Sara and Feng, Zhenhua and Kittler, Josef and Awais, Muhammad}, + date = {2024-06-25}, + title = {Investigating Self-Supervised Methods for Label-Efficient Learning}, + doi = {10.48550/ARXIV.2406.17460}, + eprint = {2406.17460}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Nandam2024 - Investigating Self Supervised Methods for Label Efficient Learning.pdf:PDF:http\://arxiv.org/pdf/2406.17460v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@InProceedings{Guo2024, + author = {Guo, Jialong and Chen, Xinghao and Tang, Yehui and Wang, Yunhe}, + booktitle = {International 
Conference on Machine Learning}, + title = {SLAB: Efficient Transformers with Simplified Linear Attention and Progressive Re-parameterized Batch Normalization}, + eprint = {2405.11582}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Guo2024 - SLAB_ Efficient Transformers with Simplified Linear Attention and Progressive Re Parameterized Batch Normalization.pdf:PDF:http\://arxiv.org/pdf/2405.11582v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@InProceedings{Ma2024, + author = {Xu Ma and Xiyang Dai and Jianwei Yang and Bin Xiao and Yinpeng Chen and Yun Fu and Lu Yuan}, + booktitle = {The Twelfth International Conference on Learning Representations}, + title = {Efficient Modulation for Vision Networks}, + eprint = {2403.19963}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=ip5LHJs6QX}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Ma2024 - Efficient Modulation for Vision Networks.pdf:PDF:http\://arxiv.org/pdf/2403.19963v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + year = {2024}, +} + +@Article{Hu2024, + author = {Hu, Youbing and Cheng, Yun and Lu, Anqi and Cao, Zhiqiang and Wei, Dawei and Liu, Jie and Li, Zhijun}, + date = {2024-01-08}, + title = {LF-ViT: Reducing Spatial Redundancy in Vision Transformer for Efficient Image Recognition}, + doi = {10.48550/ARXIV.2402.00033}, + eprint = {2402.00033}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:http\://arxiv.org/pdf/2402.00033v1:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + 
publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Wang2024, + author = {Wang, Qian-Wei and Xie, Yuqiu and Zhang, Letian and Liu, Zimo and Xia, Shu-Tao}, + date = {2024-05-23}, + title = {Pre-Trained Vision-Language Models as Partial Annotators}, + doi = {10.48550/ARXIV.2406.18550}, + eprint = {2406.18550}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Wang2024 - Pre Trained Vision Language Models As Partial Annotators.pdf:PDF:http\://arxiv.org/pdf/2406.18550v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Xu2023b, + author = {Xu, Xuwei and Wang, Sen and Chen, Yudong and Zheng, Yanping and Wei, Zhewei and Liu, Jiajun}, + date = {2023-11-06}, + title = {GTP-ViT: Efficient Vision Transformers via Graph-based Token Propagation}, + doi = {10.48550/ARXIV.2311.03035}, + eprint = {2311.03035}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + note = {WACV2024 oral}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Xu2023b - GTP ViT_ Efficient Vision Transformers Via Graph Based Token Propagation.pdf:PDF:http\://arxiv.org/pdf/2311.03035v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@InProceedings{Vasu2023a, + author = {Vasu, Pavan Kumar Anasosalu and Gabriel, James and Zhu, Jeff and Tuzel, Oncel and Ranjan, Anurag}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {FastViT: A Fast Hybrid Vision Transformer Using Structural 
Reparameterization}, + eprint = {2303.14189}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {5785-5795}, + file = {:Vasu2023a - FastViT_ a Fast Hybrid Vision Transformer Using Structural Reparameterization.pdf:PDF:http\://arxiv.org/pdf/2303.14189v2}, + month = {October}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@InProceedings{Shaker2023, + author = {Shaker, Abdelrahman and Maaz, Muhammad and Rasheed, Hanoona and Khan, Salman and Yang, Ming-Hsuan and Khan, Fahad Shahbaz}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications}, + eprint = {2303.15446}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {:Shaker2023 - SwiftFormer_ Efficient Additive Attention for Transformer Based Real Time Mobile Vision Applications.pdf:PDF:http\://arxiv.org/pdf/2303.15446v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Wang2023, + author = {Wang, Ao and Chen, Hui and Lin, Zijia and Han, Jungong and Ding, Guiguang}, + date = {2023-07-18}, + title = {RepViT: Revisiting Mobile CNN From ViT Perspective}, + doi = {10.48550/ARXIV.2307.09283}, + eprint = {2307.09283}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + note = {CVPR 2024}, + file = {:Wang2023 - RepViT_ Revisiting Mobile CNN from ViT Perspective.pdf:PDF:http\://arxiv.org/pdf/2307.09283v8}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Zhang2021, + author = {Qinglong Zhang and Yu-Bin Yang}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {ResT: An Efficient Transformer for Visual Recognition}, + 
editor = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan}, + eprint = {2105.13677}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + publisher = {arXiv}, + url = {https://openreview.net/forum?id=6Ab68Ip4Mu}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Zhang2021 - ResT_ an Efficient Transformer for Visual Recognition.pdf:PDF:http\://arxiv.org/pdf/2105.13677v5}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Beyer2020, + author = {Beyer, Lucas and Hénaff, Olivier J. and Kolesnikov, Alexander and Zhai, Xiaohua and Oord, Aäron van den}, + date = {2020-06-12}, + title = {Are we done with ImageNet?}, + doi = {10.48550/ARXIV.2006.07159}, + eprint = {2006.07159}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Beyer2020 - Are We Done with ImageNet_.pdf:PDF:http\://arxiv.org/pdf/2006.07159v1}, + groups = {Coreset for FL}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@InProceedings{Bolya2022, + author = {Bolya, Daniel and Fu, Cheng-Yang and Dai, Xiaoliang and Zhang, Peizhao and Hoffman, Judy}, + booktitle = {Computer Vision -- ECCV 2022 Workshops}, + title = {Hydra Attention: Efficient Attention with Many Heads}, + editor = {Karlinsky, Leonid and Michaeli, Tomer and Nishino, Ko}, + isbn = {978-3-031-25082-8}, + pages = {35--49}, + publisher = {Springer Nature Switzerland}, + address = {Cham}, + file = {:Bolya2022 - Hydra Attention_ Efficient Attention with Many Heads.pdf:PDF:http\://arxiv.org/pdf/2209.07484v1}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Zhang2023c, 
+ author = {Xiaosong Zhang and Yunjie Tian and Lingxi Xie and Wei Huang and Qi Dai and Qixiang Ye and Qi Tian}, + booktitle = {The Eleventh International Conference on Learning Representations}, + title = {HiViT: A Simpler and More Efficient Design of Hierarchical Vision Transformer}, + eprint = {2205.14949}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + url = {https://openreview.net/forum?id=3F6I-0-57SC}, + file = {:Zhang2022b - HiViT_ Hierarchical Vision Transformer Meets Masked Image Modeling.pdf:PDF:http\://arxiv.org/pdf/2205.14949v1}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Li2022b, + author = {Li, Jiashi and Xia, Xin and Li, Wei and Li, Huixia and Wang, Xing and Xiao, Xuefeng and Wang, Rui and Zheng, Min and Pan, Xin}, + date = {2022-07-12}, + title = {Next-ViT: Next Generation Vision Transformer for Efficient Deployment in Realistic Industrial Scenarios}, + doi = {10.48550/ARXIV.2207.05501}, + eprint = {2207.05501}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Li2022b - Next ViT_ Next Generation Vision Transformer for Efficient Deployment in Realistic Industrial Scenarios.pdf:PDF:http\://arxiv.org/pdf/2207.05501v4}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Cai2023, + author = {Cai, Han and Li, Junyan and Hu, Muyan and Gan, Chuang and Han, Song}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {EfficientViT: Lightweight Multi-Scale Attention for High-Resolution Dense Prediction}, + eprint = {2205.14756}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {17302-17313}, + file = {:Cai2022 - EfficientViT_ Multi Scale Linear Attention for High Resolution Dense 
Prediction.pdf:PDF:http\://arxiv.org/pdf/2205.14756v6}, + month = {October}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@InProceedings{Zhou2021, + author = {Haoyi Zhou and Shanghang Zhang and Jieqi Peng and Shuai Zhang and Jianxin Li and Hui Xiong and Wancai Zhang}, + booktitle = {Thirty-Fifth {AAAI} Conference on Artificial Intelligence, {AAAI} 2021, Thirty-Third Conference on Innovative Applications of Artificial Intelligence, {IAAI} 2021, The Eleventh Symposium on Educational Advances in Artificial Intelligence, {EAAI} 2021, Virtual Event, February 2-9, 2021}, + title = {Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting}, + doi = {10.1609/AAAI.V35I12.17325}, + eprint = {2012.07436}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + pages = {11106--11115}, + publisher = {{AAAI} Press}, + url = {https://doi.org/10.1609/aaai.v35i12.17325}, + file = {:Zhou2020 - Informer_ beyond Efficient Transformer for Long Sequence Time Series Forecasting.pdf:PDF:http\://arxiv.org/pdf/2012.07436v3}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Information Retrieval (cs.IR), FOS: Computer and information sciences}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Shao2024, + author = {Shao, Tong and Tian, Zhuotao and Zhao, Hang and Su, Jingyong}, + date = {2024-07-11}, + title = {Explore the Potential of CLIP for Training-Free Open Vocabulary Semantic Segmentation}, + doi = {10.48550/ARXIV.2407.08268}, + eprint = {2407.08268}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Shao2024 - Explore the Potential of CLIP for Training Free Open Vocabulary Semantic Segmentation.pdf:PDF:http\://arxiv.org/pdf/2407.08268v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = 
{qualityAssured}, + readstatus = {skimmed}, + year = {2024}, +} + +@Article{Simoncini2024, + author = {Simoncini, Walter and Gidaris, Spyros and Bursuc, Andrei and Asano, Yuki M.}, + date = {2024-07-15}, + title = {No Train, all Gain: Self-Supervised Gradients Improve Deep Frozen Representations}, + doi = {10.48550/ARXIV.2407.10964}, + eprint = {2407.10964}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Simoncini2024 - No Train, All Gain_ Self Supervised Gradients Improve Deep Frozen Representations.pdf:PDF:http\://arxiv.org/pdf/2407.10964v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Wang2024a, + author = {Wang, Yancheng and Yang, Yingzhen}, + date = {2024-07-21}, + title = {Efficient Visual Transformer by Learnable Token Merging}, + doi = {10.48550/ARXIV.2407.15219}, + eprint = {2407.15219}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Wang2024a - Efficient Visual Transformer by Learnable Token Merging.pdf:PDF:http\://arxiv.org/pdf/2407.15219v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Chen2018, + author = {Chen, Ricky T. Q. 
and Rubanova, Yulia and Bettencourt, Jesse and Duvenaud, David}, + date = {2018-06-19}, + title = {Neural Ordinary Differential Equations}, + doi = {10.48550/ARXIV.1806.07366}, + eprint = {1806.07366}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Chen2018 - Neural Ordinary Differential Equations.pdf:PDF:http\://arxiv.org/pdf/1806.07366v5}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2018}, +} + +@Article{Wang2024b, + author = {Wang, Haoqi and Zhang, Tong and Salzmann, Mathieu}, + date = {2024-07-23}, + title = {SINDER: Repairing the Singular Defects of DINOv2}, + doi = {10.48550/ARXIV.2407.16826}, + eprint = {2407.16826}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Wang2024b - SINDER_ Repairing the Singular Defects of DINOv2.pdf:PDF:http\://arxiv.org/pdf/2407.16826v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Zhang2024b, + author = {Zhang, Tianxiao and Xu, Wenju and Luo, Bo and Wang, Guanghui}, + date = {2024-07-28}, + title = {Depth-Wise Convolutions in Vision Transformers for Efficient Training on Small Datasets}, + doi = {10.48550/ARXIV.2407.19394}, + eprint = {2407.19394}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhang2024b - Depth Wise Convolutions in Vision Transformers for Efficient Training on Small Datasets.pdf:PDF:http\://arxiv.org/pdf/2407.19394v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + 
year = {2024}, +} + +@Article{Qi2023, + author = {Qi, Xianbiao and Wang, Jianan and Chen, Yihao and Shi, Yukai and Zhang, Lei}, + date = {2023-04-19}, + title = {LipsFormer: Introducing Lipschitz Continuity to Vision Transformers}, + doi = {10.48550/ARXIV.2304.09856}, + eprint = {2304.09856}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Qi2023 - LipsFormer_ Introducing Lipschitz Continuity to Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2304.09856v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Shalam2024, + author = {Shalam, Daniel and Korman, Simon}, + date = {2024-08-04}, + title = {Unsupervised Representation Learning by Balanced Self Attention Matching}, + doi = {10.48550/ARXIV.2408.02014}, + eprint = {2408.02014}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Shalam2024 - Unsupervised Representation Learning by Balanced Self Attention Matching.pdf:PDF:http\://arxiv.org/pdf/2408.02014v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + readstatus = {read}, + year = {2024}, +} + +@Article{Foret2020, + author = {Foret, Pierre and Kleiner, Ariel and Mobahi, Hossein and Neyshabur, Behnam}, + date = {2020-10-03}, + title = {Sharpness-Aware Minimization for Efficiently Improving Generalization}, + doi = {10.48550/ARXIV.2010.01412}, + eprint = {2010.01412}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Foret2020 - Sharpness Aware Minimization for Efficiently Improving 
Generalization.pdf:PDF:http\://arxiv.org/pdf/2010.01412v3}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@Article{Gou2024, + author = {Gou, Chenhui and Felemban, Abdulwahab and Khan, Faizan Farooq and Zhu, Deyao and Cai, Jianfei and Rezatofighi, Hamid and Elhoseiny, Mohamed}, + date = {2024-08-07}, + title = {How Well Can Vision Language Models See Image Details?}, + doi = {10.48550/ARXIV.2408.03940}, + eprint = {2408.03940}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Gou2024 - How Well Can Vision Language Models See Image Details_.pdf:PDF:http\://arxiv.org/pdf/2408.03940v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Zhang2024c, + author = {Zhang, Tianfang and Li, Lei and Zhou, Yang and Liu, Wentao and Qian, Chen and Ji, Xiangyang}, + date = {2024-08-07}, + title = {CAS-ViT: Convolutional Additive Self-attention Vision Transformers for Efficient Mobile Applications}, + doi = {10.48550/ARXIV.2408.03703}, + eprint = {2408.03703}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Zhang2024c - CAS ViT_ Convolutional Additive Self Attention Vision Transformers for Efficient Mobile Applications.pdf:PDF:http\://arxiv.org/pdf/2408.03703v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Alper2024, + author = {Alper, Morris and Averbuch-Elor, Hadar}, + date = {2024-07-11}, + title = {Emergent Visual-Semantic Hierarchies in Image-Text 
Representations}, + doi = {10.48550/ARXIV.2407.08521}, + eprint = {2407.08521}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Alper2024 - Emergent Visual Semantic Hierarchies in Image Text Representations.pdf:PDF:http\://arxiv.org/pdf/2407.08521v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Jie2024, + author = {Jie, Shibo and Tang, Yehui and Guo, Jianyuan and Deng, Zhi-Hong and Han, Kai and Wang, Yunhe}, + date = {2024-08-13}, + title = {Token Compensator: Altering Inference Cost of Vision Transformer without Re-Tuning}, + doi = {10.48550/ARXIV.2408.06798}, + eprint = {2408.06798}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Zero v1.0 Universal}, + file = {:Jie2024 - Token Compensator_ Altering Inference Cost of Vision Transformer without Re Tuning.pdf:PDF:http\://arxiv.org/pdf/2408.06798v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Misc{Ranzinger2023, + author = {Ranzinger, Mike and Heinrich, Greg and Kautz, Jan and Molchanov, Pavlo}, + date = {2023-12-10}, + title = {AM-RADIO: Agglomerative Vision Foundation Model -- Reduce All Domains Into One}, + doi = {10.48550/ARXIV.2312.06709}, + eprint = {2312.06709}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {efficient backbone, we evaluated numerous architectures in}, + booktitle = {CVPR 2024 Conference Paper}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Ranzinger2023 - AM RADIO_ Agglomerative Vision Foundation Model Reduce All Domains into One.pdf:PDF}, + keywords = {Computer Vision and Pattern 
Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2023}, +} + +@InProceedings{Zoph2020, + author = {Zoph, Barret and Ghiasi, Golnaz and Lin, Tsung-Yi and Cui, Yin and Liu, Hanxiao and Cubuk, Ekin Dogus and Le, Quoc}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Rethinking Pre-training and Self-training}, + editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin}, + pages = {3833--3845}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper_files/paper/2020/file/27e9661e033a73a6ad8cefcde965c54d-Paper.pdf}, + volume = {33}, + file = {:Zoph2020 - Rethinking Pre Training and Self Training.pdf:PDF}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@InProceedings{Pham2021, + author = {Pham, Hieu and Dai, Zihang and Xie, Qizhe and Le, Quoc V.}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {Meta Pseudo Labels}, + pages = {11557-11568}, + file = {:Pham2021 - Meta Pseudo Labels.pdf:PDF}, + month = {June}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@InProceedings{Raghu2021, + author = {Raghu, Aniruddh and Lorraine, Jonathan and Kornblith, Simon and McDermott, Matthew and Duvenaud, David K}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {Meta-learning to Improve Pre-training}, + editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan}, + pages = {23231--23244}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper_files/paper/2021/file/c3810d4a9513b028fc0f2a83cb6d7b50-Paper.pdf}, + volume = {34}, + file = {:Raghu2021 - Meta Learning to Improve Pre Training.pdf:PDF}, + priority = {prio2}, + year = {2021}, +} + +@Article{Kage2024, + author = {Kage, Patrick and Rothenberger, Jay C. 
and Andreadis, Pavlos and Diochnos, Dimitrios I.}, + date = {2024-08-13}, + title = {A Review of Pseudo-Labeling for Computer Vision}, + doi = {10.48550/ARXIV.2408.07221}, + eprint = {2408.07221}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Kage2024 - A Review of Pseudo Labeling for Computer Vision.pdf:PDF:http\://arxiv.org/pdf/2408.07221v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences, I.2.0; I.5.4; I.4.0}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Xu2024, + author = {Moucheng Xu and Yukun Zhou and Chen Jin and Marius {de Groot} and Daniel C. Alexander and Neil P. Oxtoby and Yipeng Hu and Joseph Jacob}, + title = {Expectation maximisation pseudo labels}, + doi = {10.1016/j.media.2024.103125}, + issn = {1361-8415}, + pages = {103125}, + url = {https://www.sciencedirect.com/science/article/pii/S1361841524000501}, + volume = {94}, + file = {:Xu2024 - Expectation Maximisation Pseudo Labels.pdf:PDF}, + journal = {Medical Image Analysis}, + keywords = {Pseudo labels, Bayesian deep learning, Expectation–maximisation, Semi-supervised learning, Segmentation, Generative models, Robustness}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Jin2022, + author = {Jin, Zezhong and Zhong, Dading and Song, Xiao and Liu, Zhaoyi and Ye, Naipeng and Zeng, Qingcheng}, + date = {2022-10-28}, + title = {Filter and evolve: progressive pseudo label refining for semi-supervised automatic speech recognition}, + doi = {10.48550/ARXIV.2210.16318}, + eprint = {2210.16318}, + eprintclass = {cs.SD}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Jin2022 - Filter and Evolve_ Progressive Pseudo Label Refining for Semi Supervised Automatic Speech 
Recognition.pdf:PDF:http\://arxiv.org/pdf/2210.16318v1}, + keywords = {Sound (cs.SD), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@Article{Rothenberger2023, + author = {Rothenberger, Jay C. and Diochnos, Dimitrios I.}, + date = {2023-11-29}, + title = {Meta Co-Training: Two Views are Better than One}, + doi = {10.48550/ARXIV.2311.18083}, + eprint = {2311.18083}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Rothenberger2023 - Meta Co Training_ Two Views Are Better Than One.pdf:PDF:http\://arxiv.org/pdf/2311.18083v4}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences, I.2.6; I.4.10}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Plested2022, + author = {Plested, Jo and Gedeon, Tom}, + date = {2022-05-20}, + title = {Deep transfer learning for image classification: a survey}, + doi = {10.48550/ARXIV.2205.09904}, + eprint = {2205.09904}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Plested2022 - Deep Transfer Learning for Image Classification_ a Survey.pdf:PDF:http\://arxiv.org/pdf/2205.09904v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Nguyen2024a, + author = {Nguyen, Khanh-Binh and Park, Chae Jung}, + date = {2024-08-23}, + title = {Symmetric masking strategy enhances the performance of Masked 
Image Modeling}, + doi = {10.48550/ARXIV.2408.12772}, + eprint = {2408.12772}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Nguyen2024a - Symmetric Masking Strategy Enhances the Performance of Masked Image Modeling.pdf:PDF:http\://arxiv.org/pdf/2408.12772v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Khan2024, + author = {Khan, Asifullah and Sohail, Anabia and Fiaz, Mustansar and Hassan, Mehdi and Afridi, Tariq Habib and Marwat, Sibghat Ullah and Munir, Farzeen and Ali, Safdar and Naseem, Hannan and Zaheer, Muhammad Zaigham and Ali, Kamran and Sultana, Tangina and Tanoli, Ziaurrehman and Akhter, Naeem}, + date = {2024-08-30}, + title = {A Survey of the Self Supervised Learning Mechanisms for Vision Transformers}, + doi = {10.48550/ARXIV.2408.17059}, + eprint = {2408.17059}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Khan2024 - A Survey of the Self Supervised Learning Mechanisms for Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2408.17059v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Wang2024c, + author = {Wang, Zhicai and Wei, Longhui and Wang, Tan and Chen, Heyu and Hao, Yanbin and Wang, Xiang and He, Xiangnan and Tian, Qi}, + date = {2024-03-28}, + title = {Enhance Image Classification via Inter-Class Image Mixup with Diffusion Model}, + doi = {10.48550/ARXIV.2403.19600}, + eprint = {2403.19600}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Zero v1.0 Universal}, + file = {:Wang2024c - Enhance Image 
Classification Via Inter Class Image Mixup with Diffusion Model.pdf:PDF:http\://arxiv.org/pdf/2403.19600v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Islam2024, + author = {Islam, Khawar and Zaheer, Muhammad Zaigham and Mahmood, Arif and Nandakumar, Karthik}, + date = {2024-04-05}, + title = {DiffuseMix: Label-Preserving Data Augmentation with Diffusion Models}, + doi = {10.48550/ARXIV.2405.14881}, + eprint = {2405.14881}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Islam2024 - DiffuseMix_ Label Preserving Data Augmentation with Diffusion Models.pdf:PDF:http\://arxiv.org/pdf/2405.14881v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Belghazi2018, + author = {Belghazi, Mohamed Ishmael and Rajeswar, Sai and Mastropietro, Olivier and Rostamzadeh, Negar and Mitrovic, Jovana and Courville, Aaron}, + date = {2018-02-04}, + title = {Hierarchical Adversarially Learned Inference}, + doi = {10.48550/ARXIV.1802.01071}, + eprint = {1802.01071}, + eprintclass = {stat.ML}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Belghazi2018 - Hierarchical Adversarially Learned Inference.pdf:PDF:http\://arxiv.org/pdf/1802.01071v1}, + keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2018}, +} + +@Article{Verma2018, + author = {Verma, Vikas and Lamb, Alex and Beckham, Christopher and Najafi, Amir and Mitliagkas, Ioannis and Courville, Aaron and Lopez-Paz, David and Bengio, Yoshua}, + date = {2018-06-13}, + 
title = {Manifold Mixup: Better Representations by Interpolating Hidden States}, + doi = {10.48550/ARXIV.1806.05236}, + eprint = {1806.05236}, + eprintclass = {stat.ML}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Verma2018 - Manifold Mixup_ Better Representations by Interpolating Hidden States.pdf:PDF:http\://arxiv.org/pdf/1806.05236v7}, + keywords = {Machine Learning (stat.ML), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2018}, +} + +@Article{Shen2020, + author = {Shen, Zhiqiang and Liu, Zechun and Liu, Zhuang and Savvides, Marios and Darrell, Trevor and Xing, Eric}, + date = {2020-03-11}, + title = {Un-Mix: Rethinking Image Mixtures for Unsupervised Visual Representation Learning}, + doi = {10.48550/ARXIV.2003.05438}, + eprint = {2003.05438}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Shen2020 - Un Mix_ Rethinking Image Mixtures for Unsupervised Visual Representation Learning.pdf:PDF:http\://arxiv.org/pdf/2003.05438v5}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + priority = {prio2}, + publisher = {arXiv}, + year = {2020}, +} + +@InProceedings{Beckham2019, + author = {Beckham, Christopher and Honari, Sina and Verma, Vikas and Lamb, Alex M and Ghadiri, Farnoosh and Hjelm, R Devon and Bengio, Yoshua and Pal, Chris}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {On Adversarial Mixup Resynthesis}, + editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. 
Garnett}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper_files/paper/2019/file/f708f064faaf32a43e4d3c784e6af9ea-Paper.pdf}, + volume = {32}, + file = {:Beckham2019 - On Adversarial Mixup Resynthesis.pdf:PDF}, + priority = {prio2}, + year = {2019}, +} + +@Article{Wang2024d, + author = {Wang, Yiheng and Lin, Jiayu and Lin, Zuoquan}, + date = {2024-09-04}, + title = {A Comparative Study of Pre-training and Self-training}, + doi = {10.48550/ARXIV.2409.02751}, + eprint = {2409.02751}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Wang2024d - A Comparative Study of Pre Training and Self Training.pdf:PDF:http\://arxiv.org/pdf/2409.02751v1}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Knyazev2024, + author = {Knyazev, Boris and Moudgil, Abhinav and Lajoie, Guillaume and Belilovsky, Eugene and Lacoste-Julien, Simon}, + date = {2024-09-06}, + title = {Accelerating Training with Neuron Interaction and Nowcasting Networks}, + doi = {10.48550/ARXIV.2409.04434}, + eprint = {2409.04434}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Knyazev2024 - Accelerating Training with Neuron Interaction and Nowcasting Networks.pdf:PDF:http\://arxiv.org/pdf/2409.04434v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Ramapuram2024, + author = {Ramapuram, Jason and Danieli, Federico and Dhekane, Eeshan and Weers, Floris and Busbridge, Dan and Ablin, Pierre and Likhomanenko, Tatiana and Digani, Jagrit and Gu, Zijin and Shidani, Amitis and Webb, Russ}, + date = {2024-09-06}, + title = {Theory, Analysis, and Best 
Practices for Sigmoid Self-Attention}, + doi = {10.48550/ARXIV.2409.04431}, + eprint = {2409.04431}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ramapuram2024 - Theory, Analysis, and Best Practices for Sigmoid Self Attention.pdf:PDF:http\://arxiv.org/pdf/2409.04431v1}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Yu2024a, + author = {Yu, Yonghao and Zhao, Dongcheng and Shen, Guobin and Dong, Yiting and Zeng, Yi}, + date = {2024-09-11}, + title = {Brain-Inspired Stepwise Patch Merging for Vision Transformers}, + doi = {10.48550/ARXIV.2409.06963}, + eprint = {2409.06963}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Yu2024a - Brain Inspired Stepwise Patch Merging for Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2409.06963v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@InProceedings{Vo2024, + author = {Xuan-Thuy Vo and Duy-Linh Nguyen and Adri Priadana and Kang-Hyun Jo}, + booktitle = {European Conference on Computer Vision (ECCV)}, + title = {Efficient Vision Transformers with Partial Attention}, + file = {:Vo2024 - Efficient Vision Transformers with Partial Attention.pdf:PDF}, + priority = {prio3}, + year = {2024}, +} + +@InProceedings{Su2024, + author = {Diwei Su and Cheng Fei and Jianxu Luo}, + booktitle = {European Conference on Computer Vision (ECCV)}, + title = {Removing Rows and Columns of Tokens in Vision Transformer enables Faster Dense Prediction without Retraining}, + file = {:Su2024 - Removing Rows and Columns of Tokens in Vision Transformer Enables Faster Dense Prediction without Retraining.pdf:PDF}, + priority = {prio3}, + year 
= {2024}, +} + +@InProceedings{Zheng2024, + author = {Kecheng Zheng and Yifei Zhang and Wei Wu and Fan Lu and Shuailei Ma and Xin Jin and Wei Chen and Yujun Shen}, + booktitle = {European Conference on Computer Vision (ECCV)}, + title = {Language-Image Pre-training with Long Captions}, + file = {:Zheng2024 - Language Image Pre Training with Long Captions.pdf:PDF}, + priority = {prio2}, + year = {2024}, +} + +@InProceedings{Li2024b, + author = {Lujun Li and Zimian Wei and Peijie Dong and Wenhan Luo and Wei Xue and Qifeng Liu and Yike Guo}, + booktitle = {European Conference on Computer Vision (ECCV)}, + title = {{AttnZero:} Efficient Attention Discovery for Vision Transformers}, + file = {:Li2024b - AttnZero_ Efficient Attention Discovery for Vision Transformers.pdf:PDF}, + priority = {prio3}, + year = {2024}, +} + +@Article{Oquab2023, + author = {Oquab, Maxime and Darcet, Timothée and Moutakanni, Théo and Vo, Huy and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and El-Nouby, Alaaeldin and Assran, Mahmoud and Ballas, Nicolas and Galuba, Wojciech and Howes, Russell and Huang, Po-Yao and Li, Shang-Wen and Misra, Ishan and Rabbat, Michael and Sharma, Vasu and Synnaeve, Gabriel and Xu, Hu and Jegou, Hervé and Mairal, Julien and Labatut, Patrick and Joulin, Armand and Bojanowski, Piotr}, + date = {2023-04-14}, + title = {DINOv2: Learning Robust Visual Features without Supervision}, + doi = {10.48550/ARXIV.2304.07193}, + eprint = {2304.07193}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Oquab2023 - DINOv2_ Learning Robust Visual Features without Supervision.pdf:PDF:http\://arxiv.org/pdf/2304.07193v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Zadeh2020, + author = 
{Zadeh, Amir and Liang, Paul Pu and Morency, Louis-Philippe}, + date = {2020-12}, + journaltitle = {Information Fusion}, + title = {Foundations of Multimodal Co-learning}, + doi = {10.1016/j.inffus.2020.06.001}, + issn = {1566-2535}, + pages = {188--193}, + volume = {64}, + file = {:Zadeh2020 - Foundations of Multimodal Co Learning.pdf:PDF}, + priority = {prio1}, + publisher = {Elsevier BV}, +} + +@Article{Li2024c, + author = {Li, Zeyu Michael}, + date = {2024-10-01}, + title = {Using Interleaved Ensemble Unlearning to Keep Backdoors at Bay for Finetuning Vision Transformers}, + doi = {10.48550/ARXIV.2410.01128}, + eprint = {2410.01128}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Li2024c - Using Interleaved Ensemble Unlearning to Keep Backdoors at Bay for Finetuning Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2410.01128v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Leviathan2024, + author = {Leviathan, Yaniv and Kalman, Matan and Matias, Yossi}, + date = {2024-10-03}, + title = {Selective Attention Improves Transformer}, + doi = {10.48550/ARXIV.2410.02703}, + eprint = {2410.02703}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Leviathan2024 - Selective Attention Improves Transformer.pdf:PDF:http\://arxiv.org/pdf/2410.02703v1}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Rahman2024, + author = {Rahman, Md Maklachur and Tutul, Abdullah Aman and Nath, Ankur and Laishram, Lamyanba and Jung, Soon Ki and Hammond, Tracy}, + date = {2024-10-04}, + title = {Mamba in 
Vision: A Comprehensive Survey of Techniques and Applications}, + doi = {10.48550/ARXIV.2410.03105}, + eprint = {2410.03105}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {Mamba is emerging as a novel approach to overcome the challenges faced by Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) in computer vision. While CNNs excel at extracting local features, they often struggle to capture long-range dependencies without complex architectural modifications. In contrast, ViTs effectively model global relationships but suffer from high computational costs due to the quadratic complexity of their self-attention mechanisms. Mamba addresses these limitations by leveraging Selective Structured State Space Models to effectively capture long-range dependencies with linear computational complexity. This survey analyzes the unique contributions, computational benefits, and applications of Mamba models while also identifying challenges and potential future research directions. We provide a foundational resource for advancing the understanding and growth of Mamba models in computer vision. An overview of this work is available at https://github.com/maklachur/Mamba-in-Computer-Vision.}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Rahman2024 - Mamba in Vision_ a Comprehensive Survey of Techniques and Applications.pdf:PDF:http\://arxiv.org/pdf/2410.03105v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Liu2024, + author = {Liu, Ziming and Wang, Yixuan and Vaidya, Sachin and Ruehle, Fabian and Halverson, James and Soljačić, Marin and Hou, Thomas Y. 
and Tegmark, Max}, + date = {2024-04-30}, + title = {KAN: Kolmogorov-Arnold Networks}, + doi = {10.48550/ARXIV.2404.19756}, + eprint = {2404.19756}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Liu2024 - KAN_ Kolmogorov Arnold Networks.pdf:PDF:http\://arxiv.org/pdf/2404.19756v4}, + keywords = {Machine Learning (cs.LG), Disordered Systems and Neural Networks (cond-mat.dis-nn), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Physical sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Feng2024, + author = {Feng, Leo and Tung, Frederick and Ahmed, Mohamed Osama and Bengio, Yoshua and Hajimirsadegh, Hossein}, + date = {2024-10-02}, + title = {Were RNNs All We Needed?}, + doi = {10.48550/ARXIV.2410.01201}, + eprint = {2410.01201}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Feng2024 - Were RNNs All We Needed_.pdf:PDF:http\://arxiv.org/pdf/2410.01201v2}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Yun2021, + author = {Yun, Sangdoo and Oh, Seong Joon and Heo, Byeongho and Han, Dongyoon and Choe, Junsuk and Chun, Sanghyuk}, + date = {2021-01-13}, + title = {Re-labeling ImageNet: from Single to Multi-Labels, from Global to Localized Labels}, + doi = {10.48550/ARXIV.2101.05022}, + eprint = {2101.05022}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Share Alike 4.0 International}, + file = {:Yun2021 - Re Labeling ImageNet_ from Single to Multi Labels, from Global to Localized Labels.pdf:PDF:http\://arxiv.org/pdf/2101.05022v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + 
publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Wang2024e, + author = {Wang, Junxuan and Ge, Xuyang and Shu, Wentao and Tang, Qiong and Zhou, Yunhua and He, Zhengfu and Qiu, Xipeng}, + date = {2024-10-09}, + title = {Towards Universality: Studying Mechanistic Similarity Across Language Model Architectures}, + doi = {10.48550/ARXIV.2410.06672}, + eprint = {2410.06672}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Share Alike 4.0 International}, + file = {:Wang2024e - Towards Universality_ Studying Mechanistic Similarity across Language Model Architectures.pdf:PDF:http\://arxiv.org/pdf/2410.06672v2}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Gavrikov2024, + author = {Gavrikov, Paul and Agnihotri, Shashank and Keuper, Margret and Keuper, Janis}, + date = {2024-10-18}, + title = {How Do Training Methods Influence the Utilization of Vision Models?}, + doi = {10.48550/ARXIV.2410.14470}, + eprint = {2410.14470}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Gavrikov2024 - How Do Training Methods Influence the Utilization of Vision Models_.pdf:PDF:http\://arxiv.org/pdf/2410.14470v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Assran2023, + author = {Assran, Mahmoud and Duval, Quentin and Misra, Ishan and Bojanowski, Piotr and Vincent, Pascal and Rabbat, Michael and LeCun, Yann and Ballas, Nicolas}, + date = {2023-01-19}, + title = {Self-Supervised Learning from Images with a Joint-Embedding Predictive Architecture}, + doi = {10.48550/ARXIV.2301.08243}, + eprint = 
{2301.08243}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Assran2023 - Self Supervised Learning from Images with a Joint Embedding Predictive Architecture.pdf:PDF}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + priority = {prio2}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Kaul2024, + author = {Kaul, Prannay and Ma, Chengcheng and Elezi, Ismail and Deng, Jiankang}, + date = {2024-10-22}, + title = {From Attention to Activation: Unravelling the Enigmas of Large Language Models}, + doi = {10.48550/ARXIV.2410.17174}, + eprint = {2410.17174}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Kaul2024 - From Attention to Activation_ Unravelling the Enigmas of Large Language Models.pdf:PDF:http\://arxiv.org/pdf/2410.17174v1}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Saratchandran2024, + author = {Saratchandran, Hemanth and Zheng, Jianqiao and Ji, Yiping and Zhang, Wenbo and Lucey, Simon}, + date = {2024-10-24}, + title = {Rethinking Softmax: Self-Attention with Polynomial Activations}, + doi = {10.48550/ARXIV.2410.18613}, + eprint = {2410.18613}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Saratchandran2024 - Rethinking Softmax_ Self Attention with Polynomial Activations.pdf:PDF:http\://arxiv.org/pdf/2410.18613v1}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (stat.ML), FOS: Computer and information sciences}, + priority = 
{prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Docherty2024, + author = {Docherty, Ronan and Vamvakeros, Antonis and Cooper, Samuel J.}, + date = {2024-10-20}, + title = {Upsampling DINOv2 features for unsupervised vision tasks and weakly supervised materials segmentation}, + doi = {10.48550/ARXIV.2410.19836}, + eprint = {2410.19836}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Docherty2024 - Upsampling DINOv2 Features for Unsupervised Vision Tasks and Weakly Supervised Materials Segmentation.pdf:PDF:http\://arxiv.org/pdf/2410.19836v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Materials Science (cond-mat.mtrl-sci), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Physical sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@InProceedings{Trabucco2024, + author = {Brandon Trabucco and Kyle Doherty and Max A Gurinas and Ruslan Salakhutdinov}, + booktitle = {The Twelfth International Conference on Learning Representations}, + title = {Effective Data Augmentation With Diffusion Models}, + url = {https://openreview.net/forum?id=ZWzUA9zeAg}, + file = {:Trabucco2024 - Effective Data Augmentation with Diffusion Models.pdf:PDF}, + readstatus = {read}, + year = {2024}, +} + +@Article{Xiao2020, + author = {Xiao, Kai and Engstrom, Logan and Ilyas, Andrew and Madry, Aleksander}, + date = {2020-06-17}, + title = {Noise or Signal: The Role of Image Backgrounds in Object Recognition}, + doi = {10.48550/ARXIV.2006.09994}, + eprint = {2006.09994}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Xiao2020 - Noise or Signal_ the Role of Image Backgrounds in Object Recognition.pdf:PDF:http\://arxiv.org/pdf/2006.09994v1}, + keywords = {Computer Vision and Pattern 
Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + readstatus = {read}, + year = {2020}, +} + +@Article{Liang2023, + author = {Liang, Junhui and Liu, Ying and Vlassov, Vladimir}, + date = {2023-08-18}, + title = {The Impact of Background Removal on Performance of Neural Networks for Fashion Image Classification and Segmentation}, + doi = {10.1109/csce60160.2023.00323}, + eprint = {2308.09764}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + pages = {1960--1968}, + booktitle = {2023 Congress in Computer Science, Computer Engineering, \& Applied Computing (CSCE)}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Liang2023 - The Impact of Background Removal on Performance of Neural Networks for Fashion Image Classification and Segmentation.pdf:PDF:http\://arxiv.org/pdf/2308.09764v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + month = jul, + priority = {prio1}, + publisher = {IEEE}, + year = {2023}, +} + +@Article{Akhmedova2024, + author = {Akhmedova, Shakhnaz and Körber, Nils}, + date = {2024-04-19}, + title = {Next Generation Loss Function for Image Classification}, + doi = {10.48550/ARXIV.2404.12948}, + eprint = {2404.12948}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Akhmedova2024 - Next Generation Loss Function for Image Classification.pdf:PDF:http\://arxiv.org/pdf/2404.12948v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences}, + publisher = {arXiv}, + readstatus = {read}, + year = {2024}, +} + +@Article{Gonzalez2019, + author = {Gonzalez, Santiago and Miikkulainen, Risto}, + date = {2019-05-27}, + journaltitle = 
{Proceedings of the 2020 IEEE Congress on Evolutionary Computation}, + title = {Improved Training Speed, Accuracy, and Data Utilization Through Loss Function Optimization}, + doi = {10.48550/ARXIV.1905.11528}, + eprint = {1905.11528}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Gonzalez2019 - Improved Training Speed, Accuracy, and Data Utilization through Loss Function Optimization.pdf:PDF:http\://arxiv.org/pdf/1905.11528v3}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + readstatus = {read}, + year = {2019}, +} + +@Misc{Islam2021, + author = {Md Amirul Islam and Matthew Kowal and Sen Jia and Konstantinos G. Derpanis and Neil Bruce}, + title = {Boundary Effects in {CNN}s: Feature or Bug?}, + url = {https://openreview.net/forum?id=M4qXqdw3xC}, + file = {:Islam2021 - Boundary Effects in CNN_s_ Feature or Bug_.pdf:PDF}, + priority = {prio1}, + year = {2021}, +} + +@Article{Islam2020, + author = {Islam, Md Amirul and Jia, Sen and Bruce, Neil D. 
B.}, + date = {2020-01-22}, + title = {How Much Position Information Do Convolutional Neural Networks Encode?}, + doi = {10.48550/ARXIV.2001.08248}, + eprint = {2001.08248}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Islam2020 - How Much Position Information Do Convolutional Neural Networks Encode_.pdf:PDF:http\://arxiv.org/pdf/2001.08248v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2020}, +} + +@InProceedings{Islam2022a, + author = {Islam, Md Amirul and Kowal, Matthew and Esser, Patrick and Ommer, Bj{\"o}rn and Derpanis, Konstantinos G and Bruce, Neil DB and Runway, ML}, + booktitle = {BMVC}, + title = {Maximizing Mutual Shape Information.}, + pages = {909}, + file = {:Islam2022a - Maximizing Mutual Shape Information..pdf:PDF}, + priority = {prio1}, + year = {2022}, +} + +@Article{Zhang2024d, + author = {Zhang, Tianyi and Li, Baoxin and Seo, Jae-sun and Cao, Yu}, + date = {2024-10-31}, + title = {Context-Aware Token Selection and Packing for Enhanced Vision Transformer}, + doi = {10.48550/ARXIV.2410.23608}, + eprint = {2410.23608}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Zhang2024d - Context Aware Token Selection and Packing for Enhanced Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2410.23608v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Kobayashi2024, + author = {Kobayashi, Seijin and Akram, Yassir and Von Oswald, Johannes}, + date = {2024-10-31}, + title = {Weight decay induces low-rank attention layers}, + doi = {10.48550/ARXIV.2410.23819}, + eprint = {2410.23819}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + 
copyright = {Creative Commons Attribution 4.0 International}, + file = {:Kobayashi2024 - Weight Decay Induces Low Rank Attention Layers.pdf:PDF:http\://arxiv.org/pdf/2410.23819v1}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Arya2024, + author = {Arya, Shreyash and Rao, Sukrut and Böhle, Moritz and Schiele, Bernt}, + date = {2024-11-01}, + title = {B-cosification: Transforming Deep Neural Networks to be Inherently Interpretable}, + doi = {10.48550/ARXIV.2411.00715}, + eprint = {2411.00715}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Arya2024 - B Cosification_ Transforming Deep Neural Networks to Be Inherently Interpretable.pdf:PDF:http\://arxiv.org/pdf/2411.00715v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Luo2024, + author = {Luo, Xiangzhong and Liu, Di and Kong, Hao and Huai, Shuo and Chen, Hui and Xiong, Guochu and Liu, Weichen}, + date = {2024-11-03}, + title = {Efficient Deep Learning Infrastructures for Embedded Computing Systems: A Comprehensive Survey and Future Envision}, + doi = {10.48550/ARXIV.2411.01431}, + eprint = {2411.01431}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:http\://arxiv.org/pdf/2411.01431v1:PDF}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Huang2024a, + author = {Huang, Zilong and Ye, Qinghao and Kang, Bingyi and Feng, Jiashi and Fan, Haoqi}, + date = {2024-11-05}, + title = {Classification Done Right for Vision-Language 
Pre-Training}, + doi = {10.48550/ARXIV.2411.03313}, + eprint = {2411.03313}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Huang2024a - Classification Done Right for Vision Language Pre Training.pdf:PDF:http\://arxiv.org/pdf/2411.03313v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Li2024d, + author = {Li, Kevin Y. and Goyal, Sachin and Semedo, Joao D. and Kolter, J. Zico}, + date = {2024-11-05}, + title = {Inference Optimal VLMs Need Only One Visual Token but Larger Models}, + doi = {10.48550/ARXIV.2411.03312}, + eprint = {2411.03312}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Li2024d - Inference Optimal VLMs Need Only One Visual Token but Larger Models.pdf:PDF:http\://arxiv.org/pdf/2411.03312v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Ye2024, + author = {Ye, Wenqian and Zheng, Guangtao and Cao, Xu and Ma, Yunsheng and Zhang, Aidong}, + date = {2024-02-20}, + title = {Spurious Correlations in Machine Learning: A Survey}, + doi = {10.48550/ARXIV.2402.12715}, + eprint = {2402.12715}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Ye2024 - Spurious Correlations in Machine Learning_ a Survey.pdf:PDF:http\://arxiv.org/pdf/2402.12715v2}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + readstatus = {read}, + year = {2024}, +} + +@Article{Ponkshe2024, + author = {Ponkshe, Kaustubh and Singhal, Raghav and Gorbunov, Eduard and Tumanov, Alexey and 
Horvath, Samuel and Vepakomma, Praneeth}, + date = {2024-11-29}, + title = {Initialization using Update Approximation is a Silver Bullet for Extremely Efficient Low-Rank Fine-Tuning}, + doi = {10.48550/ARXIV.2411.19557}, + eprint = {2411.19557}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Ponkshe2024 - Initialization Using Update Approximation Is a Silver Bullet for Extremely Efficient Low Rank Fine Tuning.pdf:PDF:http\://arxiv.org/pdf/2411.19557v1}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Halbe2024, + author = {Halbe, Shaunak and Tian, Junjiao and Joseph, K J and Smith, James Seale and Stevo, Katherine and Balasubramanian, Vineeth N and Kira, Zsolt}, + date = {2024-12-05}, + title = {Grounding Descriptions in Images informs Zero-Shot Visual Recognition}, + doi = {10.48550/ARXIV.2412.04429}, + eprint = {2412.04429}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Halbe2024 - Grounding Descriptions in Images Informs Zero Shot Visual Recognition.pdf:PDF:http\://arxiv.org/pdf/2412.04429v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Lin2024, + author = {Lin, Zhenghao and Gou, Zhibin and Gong, Yeyun and Liu, Xiao and Shen, Yelong and Xu, Ruochen and Lin, Chen and Yang, Yujiu and Jiao, Jian and Duan, Nan and Chen, Weizhu}, + date = {2024-04-11}, + title = {Rho-1: Not All Tokens Are What You Need}, + doi = {10.48550/ARXIV.2404.07965}, + eprint = {2404.07965}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = 
{:Lin2024 - Rho 1_ Not All Tokens Are What You Need.pdf:PDF:http\://arxiv.org/pdf/2404.07965v3}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Zhang2024e, + author = {Zhang, Le and Yang, Qian and Agrawal, Aishwarya}, + date = {2024-12-05}, + title = {Assessing and Learning Alignment of Unimodal Vision and Language Models}, + doi = {10.48550/ARXIV.2412.04616}, + eprint = {2412.04616}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Zhang2024e - Assessing and Learning Alignment of Unimodal Vision and Language Models.pdf:PDF:http\://arxiv.org/pdf/2412.04616v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Lew2024, + author = {Lew, Jaihyun and Jang, Soohyuk and Lee, Jaehoon and Yoo, Seungryong and Kim, Eunji and Lee, Saehyung and Mok, Jisoo and Kim, Siwon and Yoon, Sungroh}, + date = {2024-12-06}, + title = {Superpixel Tokenization for Vision Transformers: Preserving Semantic Integrity in Visual Tokens}, + doi = {10.48550/ARXIV.2412.04680}, + eprint = {2412.04680}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Lew2024 - Superpixel Tokenization for Vision Transformers_ Preserving Semantic Integrity in Visual Tokens.pdf:PDF:http\://arxiv.org/pdf/2412.04680v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Li2024e, + author = {Li, Alexander C. 
and Tian, Yuandong and Chen, Beidi and Pathak, Deepak and Chen, Xinlei}, + date = {2024-11-14}, + title = {On the Surprising Effectiveness of Attention Transfer for Vision Transformers}, + doi = {10.48550/ARXIV.2411.09702}, + eprint = {2411.09702}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Li2024e - On the Surprising Effectiveness of Attention Transfer for Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2411.09702v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Xu2024a, + author = {Xu, Minghao and Xiang, Lichuan and Cai, Xu and Wen, Hongkai}, + date = {2024-12-16}, + title = {No More Adam: Learning Rate Scaling at Initialization is All You Need}, + doi = {10.48550/ARXIV.2412.11768}, + eprint = {2412.11768}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Xu2024a - No More Adam_ Learning Rate Scaling at Initialization Is All You Need.pdf:PDF:http\://arxiv.org/pdf/2412.11768v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@InProceedings{GontijoLopes2021, + author = {Raphael Gontijo-Lopes and Sylvia Smullin and Ekin Dogus Cubuk and Ethan Dyer}, + booktitle = {International Conference on Learning Representations}, + title = {Tradeoffs in Data Augmentation: An Empirical Study}, + url = {https://openreview.net/forum?id=ZcKPWuhG6wy}, + file = {:/home/tnauen/cloud/JobDFKI/Papers/GontijoLopes2021 - Tradeoffs in Data Augmentation_ an Empirical Study.pdf:PDF}, + readstatus = {read}, + year = {2021}, +} + +@Article{Guo2024a, + author = {Guo, Qiushan and 
De Mello, Shalini and Yin, Hongxu and Byeon, Wonmin and Cheung, Ka Chun and Yu, Yizhou and Luo, Ping and Liu, Sifei}, + date = {2024-03-04}, + title = {RegionGPT: Towards Region Understanding Vision Language Model}, + doi = {10.48550/ARXIV.2403.02330}, + eprint = {2403.02330}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Guo2024a - RegionGPT_ Towards Region Understanding Vision Language Model.pdf:PDF:http\://arxiv.org/pdf/2403.02330v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Liu2023c, + author = {Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae}, + date = {2023-04-17}, + title = {Visual Instruction Tuning}, + doi = {10.48550/ARXIV.2304.08485}, + eprint = {2304.08485}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Liu2023c - Visual Instruction Tuning.pdf:PDF:http\://arxiv.org/pdf/2304.08485v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Liu2023d, + author = {Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae}, + date = {2023-10-05}, + title = {Improved Baselines with Visual Instruction Tuning}, + doi = {10.48550/ARXIV.2310.03744}, + eprint = {2310.03744}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Liu2023d - Improved Baselines with Visual Instruction Tuning.pdf:PDF:http\://arxiv.org/pdf/2310.03744v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine 
Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Li2024f, + author = {Li, Zhiyuan and Xia, Tingyu and Chang, Yi and Wu, Yuan}, + date = {2024-12-19}, + title = {A Survey of RWKV}, + doi = {10.48550/ARXIV.2412.14847}, + eprint = {2412.14847}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Li2024f - A Survey of RWKV.pdf:PDF:http\://arxiv.org/pdf/2412.14847v2}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Jose2024, + author = {Jose, Cijo and Moutakanni, Théo and Kang, Dahyun and Baldassarre, Federico and Darcet, Timothée and Xu, Hu and Li, Daniel and Szafraniec, Marc and Ramamonjisoa, Michaël and Oquab, Maxime and Siméoni, Oriane and Vo, Huy V. and Labatut, Patrick and Bojanowski, Piotr}, + date = {2024-12-20}, + title = {DINOv2 Meets Text: A Unified Framework for Image- and Pixel-Level Vision-Language Alignment}, + doi = {10.48550/ARXIV.2412.16334}, + eprint = {2412.16334}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Jose2024 - DINOv2 Meets Text_ a Unified Framework for Image and Pixel Level Vision Language Alignment.pdf:PDF:http\://arxiv.org/pdf/2412.16334v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Yoa2024, + author = {Yoa, Seungdong and Lee, Seungjun and Cho, Hyeseung and Kim, Bumsoo and Lim, Woohyung}, + date = {2024-12-21}, + title = {ImagePiece: Content-aware Re-tokenization for Efficient Image Recognition}, + doi = {10.48550/ARXIV.2412.16491}, + eprint = {2412.16491}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative 
Commons Attribution 4.0 International}, + file = {:Yoa2024 - ImagePiece_ Content Aware Re Tokenization for Efficient Image Recognition.pdf:PDF:http\://arxiv.org/pdf/2412.16491v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Oko2025, + author = {Oko, Kazusato and Lin, Licong and Cai, Yuhang and Mei, Song}, + date = {2025-01-08}, + title = {A Statistical Theory of Contrastive Pre-training and Multimodal Generative AI}, + doi = {10.48550/ARXIV.2501.04641}, + eprint = {2501.04641}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Oko2025 - A Statistical Theory of Contrastive Pre Training and Multimodal Generative AI.pdf:PDF:http\://arxiv.org/pdf/2501.04641v1}, + groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), Statistics Theory (math.ST), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Mathematics}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Zheng2025, + author = {Zheng, Chuanyang}, + date = {2025-01-27}, + title = {The Linear Attention Resurrection in Vision Transformer}, + doi = {10.48550/ARXIV.2501.16182}, + eprint = {2501.16182}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Zheng2025 - The Linear Attention Resurrection in Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2501.16182v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Dwibedi2017, + author = {Dwibedi, Debidatta and Misra, Ishan and Hebert, Martial}, + date = {2017-08-04}, + title = {Cut, Paste and Learn: Surprisingly Easy Synthesis for 
Instance Detection}, + doi = {10.48550/ARXIV.1708.01642}, + eprint = {1708.01642}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Dwibedi2017 - Cut, Paste and Learn_ Surprisingly Easy Synthesis for Instance Detection.pdf:PDF:http\://arxiv.org/pdf/1708.01642v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2017}, +} + +@InProceedings{Hinterstoisser2019, + author = {Hinterstoisser, Stefan and Pauly, Olivier and Heibel, Hauke and Martina, Marek and Bokeloh, Martin}, + booktitle = {2019 IEEE/CVF International Conference on Computer Vision Workshop (ICCVW)}, + title = {An Annotation Saved is an Annotation Earned: Using Fully Synthetic Training for Object Detection}, + doi = {10.1109/ICCVW.2019.00340}, + pages = {2787-2796}, + file = {:Hinterstoisser2019 - An Annotation Saved Is an Annotation Earned_ Using Fully Synthetic Training for Object Detection.pdf:PDF}, + keywords = {Computer vision;Conferences;Synthetic Data;Object Detection;Deep Learning}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2019}, +} + +@Article{Ge2023, + author = {Yunhao Ge and Jiashu Xu and Brian Nlong Zhao and Neel Joshi and Laurent Itti and Vibhav Vineet}, + title = {Beyond Generation: Harnessing Text to Image Models for Object Detection and Segmentation}, + url = {https://api.semanticscholar.org/CorpusID:261697353}, + volume = {abs/2309.05956}, + file = {:Ge2023 - Beyond Generation_ Harnessing Text to Image Models for Object Detection and Segmentation.pdf:PDF}, + journal = {ArXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Werman2021, + author = {Kassel, Levi and Werman, Michael}, + date = {2021-12-20}, + title = {DeePaste -- Inpainting for Pasting}, + doi = {10.48550/ARXIV.2112.10600}, + eprint = 
{2112.10600}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Werman2021 - DeePaste Inpainting for Pasting.pdf:PDF:http\://arxiv.org/pdf/2112.10600v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2021}, +} + +@Article{Ghiasi2020, + author = {Ghiasi, Golnaz and Cui, Yin and Srinivas, Aravind and Qian, Rui and Lin, Tsung-Yi and Cubuk, Ekin D. and Le, Quoc V. and Zoph, Barret}, + date = {2020-12-13}, + title = {Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation}, + doi = {10.48550/ARXIV.2012.07177}, + eprint = {2012.07177}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Ghiasi2020 - Simple Copy Paste Is a Strong Data Augmentation Method for Instance Segmentation.pdf:PDF:http\://arxiv.org/pdf/2012.07177v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2020}, +} + +@Article{Cubuk2019, + author = {Cubuk, Ekin D. 
and Zoph, Barret and Shlens, Jonathon and Le, Quoc V.}, + date = {2019-09-30}, + title = {RandAugment: Practical automated data augmentation with a reduced search space}, + doi = {10.48550/ARXIV.1909.13719}, + eprint = {1909.13719}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Cubuk2019 - RandAugment_ Practical Automated Data Augmentation with a Reduced Search Space.pdf:PDF:http\://arxiv.org/pdf/1909.13719v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2019}, +} + +@Article{Griffin2024, + author = {Griffin, Brent A. and Marks, Jacob and Corso, Jason J.}, + date = {2024-11-22}, + title = {Zero-Shot Coreset Selection: Efficient Pruning for Unlabeled Data}, + doi = {10.48550/ARXIV.2411.15349}, + eprint = {2411.15349}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Griffin2024 - Zero Shot Coreset Selection_ Efficient Pruning for Unlabeled Data.pdf:PDF:http\://arxiv.org/pdf/2411.15349v1}, + groups = {Coreset for FL}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Guo2022, + author = {Guo, Chengcheng and Zhao, Bo and Bai, Yanbing}, + date = {2022-04-18}, + title = {DeepCore: A Comprehensive Library for Coreset Selection in Deep Learning}, + doi = {10.48550/ARXIV.2204.08499}, + eprint = {2204.08499}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Guo2022 - DeepCore_ a Comprehensive Library for Coreset Selection in Deep Learning.pdf:PDF:http\://arxiv.org/pdf/2204.08499v3}, + groups = {Coreset for FL}, + keywords = {Machine Learning (cs.LG), 
Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2022}, +} + +@InProceedings{Yang2024c, + author = {Yang, Shuo and Cao, Zhe and Guo, Sheng and Zhang, Ruiheng and Luo, Ping and Zhang, Shengping and Nie, Liqiang}, + booktitle = {Proceedings of the 41st International Conference on Machine Learning}, + title = {Mind the Boundary: Coreset Selection via Reconstructing the Decision Boundary}, + editor = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix}, + pages = {55948--55960}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + url = {https://proceedings.mlr.press/v235/yang24b.html}, + volume = {235}, + file = {:Yang2024c - Mind the Boundary_ Coreset Selection Via Reconstructing the Decision Boundary.pdf:PDF}, + groups = {Coreset for FL}, + month = {21--27 Jul}, + pdf = {https://raw.githubusercontent.com/mlresearch/v235/main/assets/yang24b/yang24b.pdf}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Killamsetty2021, + author = {Killamsetty, Krishnateja and Zhao, Xujiang and Chen, Feng and Iyer, Rishabh}, + date = {2021-06-14}, + title = {RETRIEVE: Coreset Selection for Efficient and Robust Semi-Supervised Learning}, + doi = {10.48550/ARXIV.2106.07760}, + eprint = {2106.07760}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Killamsetty2021 - RETRIEVE_ Coreset Selection for Efficient and Robust Semi Supervised Learning.pdf:PDF:http\://arxiv.org/pdf/2106.07760v2}, + groups = {Coreset for FL}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus 
= {read}, + year = {2021}, +} + +@Article{Huang2023, + author = {Huang, Xijie and Liu, Zechun and Liu, Shih-Yang and Cheng, Kwang-Ting}, + date = {2023-06-12}, + title = {Efficient and Robust Quantization-aware Training via Adaptive Coreset Selection}, + doi = {10.48550/ARXIV.2306.07215}, + eprint = {2306.07215}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Huang2023 - Efficient and Robust Quantization Aware Training Via Adaptive Coreset Selection.pdf:PDF:http\://arxiv.org/pdf/2306.07215v3}, + groups = {Coreset for FL}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Cubuk2018, + author = {Cubuk, Ekin D. and Zoph, Barret and Mane, Dandelion and Vasudevan, Vijay and Le, Quoc V.}, + date = {2018-05-24}, + title = {AutoAugment: Learning Augmentation Policies from Data}, + doi = {10.48550/ARXIV.1805.09501}, + eprint = {1805.09501}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Cubuk2018 - AutoAugment_ Learning Augmentation Policies from Data.pdf:PDF:http\://arxiv.org/pdf/1805.09501v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2018}, +} + +@Article{Takahashi2018, + author = {Takahashi, Ryo and Matsubara, Takashi and Uehara, Kuniaki}, + date = {2018-11-22}, + journaltitle = {IEEE Transactions on Circuits and Systems for Video Technology}, + title = {Data Augmentation using Random Image Cropping and Patching for Deep CNNs}, + doi = {10.1109/tcsvt.2019.2935128}, + eprint = {1811.09030}, + eprintclass = {cs.CV}, + 
eprinttype = {arXiv}, + issn = {1558-2205}, + number = {9}, + pages = {2917--2931}, + volume = {30}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Takahashi2018 - Data Augmentation Using Random Image Cropping and Patching for Deep CNNs.pdf:PDF:http\://arxiv.org/pdf/1811.09030v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + month = sep, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + year = {2018}, +} + +@Article{Zhong2017, + author = {Zhong, Zhun and Zheng, Liang and Kang, Guoliang and Li, Shaozi and Yang, Yi}, + date = {2017-08-16}, + title = {Random Erasing Data Augmentation}, + doi = {10.48550/ARXIV.1708.04896}, + eprint = {1708.04896}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhong2017 - Random Erasing Data Augmentation.pdf:PDF:http\://arxiv.org/pdf/1708.04896v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2017}, +} + +@Article{Shorten2019, + author = {Shorten, Connor and Khoshgoftaar, Taghi M.}, + date = {2019-07}, + journaltitle = {Journal of Big Data}, + title = {A survey on Image Data Augmentation for Deep Learning}, + doi = {10.1186/s40537-019-0197-0}, + issn = {2196-1115}, + number = {1}, + volume = {6}, + file = {:Shorten2019 - A Survey on Image Data Augmentation for Deep Learning.pdf:PDF}, + publisher = {Springer Science and Business Media LLC}, + year = {2019}, +} + +@Article{Xu2023d, + author = {Xu, Mingle and Yoon, Sook and Fuentes, Alvaro and Park, Dong Sun}, + date = {2023-05}, + journaltitle = {Pattern Recognition}, + title = {A Comprehensive Survey of Image Augmentation Techniques for Deep Learning}, + doi = {10.1016/j.patcog.2023.109347}, + issn = {0031-3203}, + pages = {109347}, + volume = {137}, + file = {:Xu2023d - A Comprehensive 
Survey of Image Augmentation Techniques for Deep Learning.pdf:PDF}, + publisher = {Elsevier BV}, + year = {2023}, +} + +@Article{Ling2022, + author = {Ling, Evan and Huang, Dezhao and Hur, Minhoe}, + date = {2022-10-07}, + title = {Humans need not label more humans: Occlusion Copy \& Paste for Occluded Human Instance Segmentation}, + doi = {10.48550/ARXIV.2210.03686}, + eprint = {2210.03686}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ling2022 - Humans Need Not Label More Humans_ Occlusion Copy & Paste for Occluded Human Instance Segmentation.pdf:PDF:http\://arxiv.org/pdf/2210.03686v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Hendrycks2019, + author = {Hendrycks, Dan and Dietterich, Thomas}, + date = {2019-03-28}, + title = {Benchmarking Neural Network Robustness to Common Corruptions and Perturbations}, + doi = {10.48550/ARXIV.1903.12261}, + eprint = {1903.12261}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Hendrycks2019 - Benchmarking Neural Network Robustness to Common Corruptions and Perturbations.pdf:PDF:http\://arxiv.org/pdf/1903.12261v1}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (stat.ML), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2019}, +} + +@Article{Li2023e, + author = {Li, Xiaodan and Chen, Yuefeng and Zhu, Yao and Wang, Shuhui and Zhang, Rong and Xue, Hui}, + date = {2023-03-30}, + journaltitle = {CVPR 2023}, + title = {ImageNet-E: Benchmarking Neural Network Robustness via Attribute Editing}, + doi = {10.48550/ARXIV.2303.17096}, + eprint = {2303.17096}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + 
file = {:Li2023e - ImageNet E_ Benchmarking Neural Network Robustness Via Attribute Editing.pdf:PDF:http\://arxiv.org/pdf/2303.17096v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Zhang2024f, + author = {Zhang, Chenshuang and Pan, Fei and Kim, Junmo and Kweon, In So and Mao, Chengzhi}, + date = {2024-03-27}, + title = {ImageNet-D: Benchmarking Neural Network Robustness on Diffusion Synthetic Object}, + doi = {10.48550/ARXIV.2403.18775}, + eprint = {2403.18775}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhang2024f - ImageNet D_ Benchmarking Neural Network Robustness on Diffusion Synthetic Object.pdf:PDF:http\://arxiv.org/pdf/2403.18775v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Geirhos2018, + author = {Geirhos, Robert and Rubisch, Patricia and Michaelis, Claudio and Bethge, Matthias and Wichmann, Felix A. 
and Brendel, Wieland}, + date = {2018-11-29}, + title = {ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness}, + doi = {10.48550/ARXIV.1811.12231}, + eprint = {1811.12231}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Geirhos2018 - ImageNet Trained CNNs Are Biased Towards Texture\; Increasing Shape Bias Improves Accuracy and Robustness.pdf:PDF:http\://arxiv.org/pdf/1811.12231v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Neurons and Cognition (q-bio.NC), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Biological sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2018}, +} + +@Article{Suvorov2021, + author = {Suvorov, Roman and Logacheva, Elizaveta and Mashikhin, Anton and Remizova, Anastasia and Ashukha, Arsenii and Silvestrov, Aleksei and Kong, Naejin and Goka, Harshith and Park, Kiwoong and Lempitsky, Victor}, + date = {2021-09-15}, + title = {Resolution-robust Large Mask Inpainting with Fourier Convolutions}, + doi = {10.48550/ARXIV.2109.07161}, + eprint = {2109.07161}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Suvorov2021 - Resolution Robust Large Mask Inpainting with Fourier Convolutions.pdf:PDF:http\://arxiv.org/pdf/2109.07161v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + publisher = {arXiv}, + year = {2021}, +} + +@Article{Sun2024, + author = {Sun, Wenhao and Cui, Benlei and Dong, Xue-Mei and Tang, Jingqun}, + date = {2024-12-17}, + title = {Attentive Eraser: Unleashing Diffusion Model's Object Removal Potential via Self-Attention Redirection Guidance}, + 
doi = {10.48550/ARXIV.2412.12974}, + eprint = {2412.12974}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Sun2024 - Attentive Eraser_ Unleashing Diffusion Model's Object Removal Potential Via Self Attention Redirection Guidance.pdf:PDF:http\://arxiv.org/pdf/2412.12974v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Ren2024, + author = {Ren, Tianhe and Liu, Shilong and Zeng, Ailing and Lin, Jing and Li, Kunchang and Cao, He and Chen, Jiayu and Huang, Xinyu and Chen, Yukang and Yan, Feng and Zeng, Zhaoyang and Zhang, Hao and Li, Feng and Yang, Jie and Li, Hongyang and Jiang, Qing and Zhang, Lei}, + date = {2024-01-25}, + title = {Grounded SAM: Assembling Open-World Models for Diverse Visual Tasks}, + doi = {10.48550/ARXIV.2401.14159}, + eprint = {2401.14159}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ren2024 - Grounded SAM_ Assembling Open World Models for Diverse Visual Tasks.pdf:PDF:http\://arxiv.org/pdf/2401.14159v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Liu2023e, + author = {Liu, Shilong and Zeng, Zhaoyang and Ren, Tianhe and Li, Feng and Zhang, Hao and Yang, Jie and Jiang, Qing and Li, Chunyuan and Yang, Jianwei and Su, Hang and Zhu, Jun and Zhang, Lei}, + date = {2023-03-09}, + title = {Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection}, + doi = {10.48550/ARXIV.2303.05499}, + eprint = {2303.05499}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Liu2023e - Grounding DINO_ Marrying DINO with Grounded Pre Training for Open Set Object 
Detection.pdf:PDF:http\://arxiv.org/pdf/2303.05499v5}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2023}, +} + +@Article{Kirillov2023, + author = {Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C. and Lo, Wan-Yen and Dollár, Piotr and Girshick, Ross}, + date = {2023-04-05}, + title = {Segment Anything}, + doi = {10.48550/ARXIV.2304.02643}, + eprint = {2304.02643}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Kirillov2023 - Segment Anything.pdf:PDF:http\://arxiv.org/pdf/2304.02643v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Shermaine2025, + author = {Shermaine, Ang Jia Ning and Lazarou, Michalis and Stathaki, Tania}, + date = {2025-02-19}, + title = {Image compositing is all you need for data augmentation}, + doi = {10.48550/ARXIV.2502.13936}, + eprint = {2502.13936}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Shermaine2025 - Image Compositing Is All You Need for Data Augmentation.pdf:PDF:http\://arxiv.org/pdf/2502.13936v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Le2015, + author = {Le, Yann and Yang, Xuan}, + title = {Tiny imagenet visual recognition challenge}, + number = {7}, + pages = {3}, + volume = {7}, + file = {:Le2015 - Tiny Imagenet Visual Recognition Challenge.pdf:PDF}, + journal = {CS 
231N}, + year = {2015}, +} + +@Book{Jonhson1995, + author = {Johnson, Norman L. and Kotz, Samuel and Balakrishnan, N.}, + title = {Continuous Univariate Distributions}, + edition = {2}, + isbn = {0-471-58494-0}, + note = {Wiley series in probability and mathematical statistics}, + publisher = {Wiley}, + series = {Wiley series in probability and mathematical statistics.}, + year = {1995}, +} + +@TechReport{Maji2013, + author = {S. Maji and J. Kannala and E. Rahtu and M. Blaschko and A. Vedaldi}, + title = {Fine-Grained Visual Classification of Aircraft}, + eprint = {1306.5151}, + archiveprefix = {arXiv}, + primaryclass = {cs-cv}, + year = {2013}, +} + +@Article{Dehghan2017, + author = {Dehghan, Afshin and Masood, Syed Zain and Shu, Guang and Ortiz, Enrique G.}, + date = {2017-02-06}, + title = {View Independent Vehicle Make, Model and Color Recognition Using Convolutional Neural Network}, + doi = {10.48550/ARXIV.1702.01721}, + eprint = {1702.01721}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Dehghan2017 - View Independent Vehicle Make, Model and Color Recognition Using Convolutional Neural Network.pdf:PDF:http\://arxiv.org/pdf/1702.01721v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2017}, +} + +@Article{Kaur2017, + author = {Kaur, Parneet and Sikka, Karan and Divakaran, Ajay}, + date = {2017-12-23}, + title = {Combining Weakly and Webly Supervised Learning for Classifying Food Images}, + doi = {10.48550/ARXIV.1712.08730}, + eprint = {1712.08730}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Kaur2017 - Combining Weakly and Webly Supervised Learning for Classifying Food Images.pdf:PDF:http\://arxiv.org/pdf/1712.08730v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: 
Computer and information sciences}, + publisher = {arXiv}, + year = {2017}, +} + +@InProceedings{Parkhi2012, + author = {Omkar M. Parkhi and Andrea Vedaldi and Andrew Zisserman and C. V. Jawahar}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition}, + title = {Cats and Dogs}, + year = {2012}, +} + +@Article{Selvaraju2016, + author = {Selvaraju, Ramprasaath R. and Cogswell, Michael and Das, Abhishek and Vedantam, Ramakrishna and Parikh, Devi and Batra, Dhruv}, + date = {2016-10-07}, + journaltitle = {International Journal of Computer Vision}, + title = {Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization}, + doi = {10.1007/s11263-019-01228-7}, + eprint = {1610.02391}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + issn = {1573-1405}, + number = {2}, + pages = {336--359}, + volume = {128}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Selvaraju2016 - Grad CAM_ Visual Explanations from Deep Networks Via Gradient Based Localization.pdf:PDF:http\://arxiv.org/pdf/1610.02391v4}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + month = oct, + publisher = {Springer Science and Business Media LLC}, + year = {2016}, +} + +@InProceedings{Chattopadhay2018, + author = {Chattopadhay, Aditya and Sarkar, Anirban and Howlader, Prantik and Balasubramanian, Vineeth N}, + booktitle = {2018 IEEE Winter Conference on Applications of Computer Vision (WACV)}, + title = {Grad-CAM++: Generalized Gradient-Based Visual Explanations for Deep Convolutional Networks}, + doi = {10.1109/WACV.2018.00097}, + pages = {839-847}, + keywords = {Visualization;Heating systems;Neurons;Machine learning;Predictive models;Mathematical model}, + year = {2018}, +} + +@Article{Sundararajan2017, + author = {Sundararajan, Mukund and Taly, Ankur and Yan, Qiqi}, + date = {2017-03-04}, + title = {Axiomatic Attribution for 
Deep Networks}, + doi = {10.48550/ARXIV.1703.01365}, + eprint = {1703.01365}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Sundararajan2017 - Axiomatic Attribution for Deep Networks.pdf:PDF:http\://arxiv.org/pdf/1703.01365v2}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2017}, +} + +@Article{Carion2020, + author = {Carion, Nicolas and Massa, Francisco and Synnaeve, Gabriel and Usunier, Nicolas and Kirillov, Alexander and Zagoruyko, Sergey}, + date = {2020-05-26}, + title = {End-to-End Object Detection with Transformers}, + doi = {10.48550/ARXIV.2005.12872}, + eprint = {2005.12872}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Zero v1.0 Universal}, + file = {:Carion2020 - End to End Object Detection with Transformers.pdf:PDF:http\://arxiv.org/pdf/2005.12872v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2020}, +} + +@Article{Zong2022, + author = {Zong, Zhuofan and Song, Guanglu and Liu, Yu}, + date = {2022-11-22}, + title = {DETRs with Collaborative Hybrid Assignments Training}, + doi = {10.48550/ARXIV.2211.12860}, + eprint = {2211.12860}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Zong2022 - DETRs with Collaborative Hybrid Assignments Training.pdf:PDF:http\://arxiv.org/pdf/2211.12860v6}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Wang2022a, + author = {Wang, Wenhui and Bao, Hangbo and Dong, Li and Bjorck, Johan and Peng, Zhiliang and Liu, Qiang and Aggarwal, Kriti and Mohammed, Owais Khan and Singhal, Saksham and Som, Subhojit and Wei, Furu}, + date = {2022-08-22}, + title = {Image as a Foreign 
Language: BEiT Pretraining for All Vision and Vision-Language Tasks}, + doi = {10.48550/ARXIV.2208.10442}, + eprint = {2208.10442}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Wang2022a - Image As a Foreign Language_ BEiT Pretraining for All Vision and Vision Language Tasks.pdf:PDF:http\://arxiv.org/pdf/2208.10442v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Liu2022d, + author = {Liu, Yue and Matsoukas, Christos and Strand, Fredrik and Azizpour, Hossein and Smith, Kevin}, + date = {2022-08-10}, + title = {PatchDropout: Economizing Vision Transformers Using Patch Dropout}, + doi = {10.48550/ARXIV.2208.07220}, + eprint = {2208.07220}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Liu2022d - PatchDropout_ Economizing Vision Transformers Using Patch Dropout.pdf:PDF:http\://arxiv.org/pdf/2208.07220v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{He2017, + author = {He, Kaiming and Gkioxari, Georgia and Dollár, Piotr and Girshick, Ross}, + date = {2017-03-20}, + title = {Mask R-CNN}, + doi = {10.48550/ARXIV.1703.06870}, + eprint = {1703.06870}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:He2017 - Mask R CNN.pdf:PDF:http\://arxiv.org/pdf/1703.06870v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2017}, +} + +@InBook{Sanderson2022, + author = {Sanderson, Edward and Matuszewski, Bogdan J.}, + booktitle = {Medical Image Understanding and Analysis}, + date = {2022}, + title = 
{FCN-Transformer Feature Fusion for Polyp Segmentation}, + doi = {10.1007/978-3-031-12053-4_65}, + isbn = {9783031120534}, + pages = {892--907}, + publisher = {Springer International Publishing}, + issn = {1611-3349}, + year = {2022}, +} + +@Article{Vezakis2024, + author = {Vezakis, Ioannis A. and Georgas, Konstantinos and Fotiadis, Dimitrios and Matsopoulos, George K.}, + date = {2024-07-23}, + title = {EffiSegNet: Gastrointestinal Polyp Segmentation through a Pre-Trained EfficientNet-based Network with a Simplified Decoder}, + doi = {10.48550/ARXIV.2407.16298}, + eprint = {2407.16298}, + eprintclass = {eess.IV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Vezakis2024 - EffiSegNet_ Gastrointestinal Polyp Segmentation through a Pre Trained EfficientNet Based Network with a Simplified Decoder.pdf:PDF:http\://arxiv.org/pdf/2407.16298v1}, + keywords = {Image and Video Processing (eess.IV), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Wang2022b, + author = {Wang, Wenhai and Dai, Jifeng and Chen, Zhe and Huang, Zhenhang and Li, Zhiqi and Zhu, Xizhou and Hu, Xiaowei and Lu, Tong and Lu, Lewei and Li, Hongsheng and Wang, Xiaogang and Qiao, Yu}, + date = {2022-11-10}, + title = {InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions}, + doi = {10.48550/ARXIV.2211.05778}, + eprint = {2211.05778}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Wang2022b - InternImage_ Exploring Large Scale Vision Foundation Models with Deformable Convolutions.pdf:PDF:http\://arxiv.org/pdf/2211.05778v4}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, 
+ year = {2022}, +} + +@Article{Girshick2013, + author = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra}, + date = {2013-11-11}, + title = {Rich feature hierarchies for accurate object detection and semantic segmentation}, + doi = {10.48550/ARXIV.1311.2524}, + eprint = {1311.2524}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Girshick2013 - Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation.pdf:PDF:http\://arxiv.org/pdf/1311.2524v5}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2013}, +} + +@InProceedings{Krizhevsky2012, + author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {ImageNet Classification with Deep Convolutional Neural Networks}, + editor = {F. Pereira and C.J. Burges and L. Bottou and K.Q. Weinberger}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf}, + volume = {25}, + year = {2012}, +} + +@Article{Rangel2024, + author = {Rangel, Gabriela and Cuevas-Tello, Juan C. 
and Nunez-Varela, Jose and Puente, Cesar and Silva-Trujillo, Alejandra G.}, + date = {2024-01}, + journaltitle = {Journal of Sensors}, + title = {A Survey on Convolutional Neural Networks and Their Performance Limitations in Image Recognition Tasks}, + doi = {10.1155/2024/2797320}, + editor = {Feng, Lihang}, + issn = {1687-7268}, + number = {1}, + volume = {2024}, + publisher = {Wiley}, + year = {2024}, +} + +@Article{Alomar2023, + author = {Alomar, Khaled and Aysel, Halil Ibrahim and Cai, Xiaohao}, + date = {2023-02}, + journaltitle = {Journal of Imaging}, + title = {Data Augmentation in Classification and Segmentation: A Survey and New Strategies}, + doi = {10.3390/jimaging9020046}, + issn = {2313-433X}, + number = {2}, + pages = {46}, + volume = {9}, + publisher = {MDPI AG}, + year = {2023}, +} + +@Article{RojasGomez2023, + author = {Rojas-Gomez, Renan A. and Lim, Teck-Yian and Do, Minh N. and Yeh, Raymond A.}, + date = {2023-05-25}, + title = {Making Vision Transformers Truly Shift-Equivariant}, + doi = {10.48550/ARXIV.2305.16316}, + eprint = {2305.16316}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:RojasGomez2023 - Making Vision Transformers Truly Shift Equivariant (1).pdf:PDF:http\://arxiv.org/pdf/2305.16316v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Ding2023a, + author = {Ding, Peijian and Soselia, Davit and Armstrong, Thomas and Su, Jiahao and Huang, Furong}, + date = {2023-06-13}, + title = {Reviving Shift Equivariance in Vision Transformers}, + doi = {10.48550/ARXIV.2306.07470}, + eprint = {2306.07470}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Ding2023a - Reviving Shift Equivariance in Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2306.07470v1}, + keywords = {Computer Vision 
and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Bates1955, + author = {Bates, G.E.}, + title = {Joint distributions of time intervals for the occurrence of successive accidents in a generalized Polya urn scheme}, + pages = {705--720}, + volume = {26}, + journal = {Annals of Mathematical Statistics}, + year = {1955}, +} + +@Article{Adebayo2018, + author = {Adebayo, Julius and Gilmer, Justin and Muelly, Michael and Goodfellow, Ian and Hardt, Moritz and Kim, Been}, + date = {2018-10-08}, + title = {Sanity Checks for Saliency Maps}, + doi = {10.48550/ARXIV.1810.03292}, + eprint = {1810.03292}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Adebayo2018 - Sanity Checks for Saliency Maps.pdf:PDF:http\://arxiv.org/pdf/1810.03292v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2018}, +} + +@Article{Huang2016a, + author = {Huang, Jonathan and Rathod, Vivek and Sun, Chen and Zhu, Menglong and Korattikara, Anoop and Fathi, Alireza and Fischer, Ian and Wojna, Zbigniew and Song, Yang and Guadarrama, Sergio and Murphy, Kevin}, + date = {2016-11-30}, + title = {Speed/accuracy trade-offs for modern convolutional object detectors}, + doi = {10.48550/ARXIV.1611.10012}, + eprint = {1611.10012}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Huang2016a - Speed_accuracy Trade Offs for Modern Convolutional Object Detectors.pdf:PDF:http\://arxiv.org/pdf/1611.10012v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2016}, +} + +@Article{Shen2023, + author = {Shen, Yunhang and Fu, 
Chaoyou and Chen, Peixian and Zhang, Mengdan and Li, Ke and Sun, Xing and Wu, Yunsheng and Lin, Shaohui and Ji, Rongrong}, + date = {2023-12-04}, + title = {Aligning and Prompting Everything All at Once for Universal Visual Perception}, + doi = {10.48550/ARXIV.2312.02153}, + eprint = {2312.02153}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Shen2023 - Aligning and Prompting Everything All at Once for Universal Visual Perception.pdf:PDF:http\://arxiv.org/pdf/2312.02153v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2023}, +} + +@Article{Sinhamahapatra2024, + author = {Sinhamahapatra, Poulami and Schwaiger, Franziska and Bose, Shirsha and Wang, Huiyu and Roscher, Karsten and Guennemann, Stephan}, + date = {2024-04-11}, + title = {Finding Dino: A Plug-and-Play Framework for Zero-Shot Detection of Out-of-Distribution Objects Using Prototypes}, + doi = {10.48550/ARXIV.2404.07664}, + eprint = {2404.07664}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Sinhamahapatra2024 - Finding Dino_ a Plug and Play Framework for Zero Shot Detection of Out of Distribution Objects Using Prototypes.pdf:PDF:http\://arxiv.org/pdf/2404.07664v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Li2022a, + author = {Li, Feng and Zhang, Hao and xu, Huaizhe and Liu, Shilong and Zhang, Lei and Ni, Lionel M. 
and Shum, Heung-Yeung}, + date = {2022-06-06}, + title = {Mask DINO: Towards A Unified Transformer-based Framework for Object Detection and Segmentation}, + doi = {10.48550/ARXIV.2206.02777}, + eprint = {2206.02777}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Li2022a - Mask DINO_ Towards a Unified Transformer Based Framework for Object Detection and Segmentation.pdf:PDF:http\://arxiv.org/pdf/2206.02777v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {skimmed}, + year = {2022}, +} + +@Article{Maninis2024, + author = {Maninis, Kevis-Kokitsi and Chen, Kaifeng and Ghosh, Soham and Karpur, Arjun and Chen, Koert and Xia, Ye and Cao, Bingyi and Salz, Daniel and Han, Guangxing and Dlabal, Jan and Gnanapragasam, Dan and Seyedhosseini, Mojtaba and Zhou, Howard and Araujo, Andre}, + date = {2024-10-21}, + title = {TIPS: Text-Image Pretraining with Spatial awareness}, + doi = {10.48550/ARXIV.2410.16512}, + eprint = {2410.16512}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Maninis2024 - TIPS_ Text Image Pretraining with Spatial Awareness.pdf:PDF:http\://arxiv.org/pdf/2410.16512v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Naeem2023, + author = {Naeem, Muhammad Ferjad and Xian, Yongqin and Zhai, Xiaohua and Hoyer, Lukas and Van Gool, Luc and Tombari, Federico}, + date = {2023-10-20}, + title = {SILC: Improving Vision Language Pretraining with Self-Distillation}, + doi = {10.48550/ARXIV.2310.13355}, + eprint = {2310.13355}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 
International}, + file = {:Naeem2023 - SILC_ Improving Vision Language Pretraining with Self Distillation.pdf:PDF:http\://arxiv.org/pdf/2310.13355v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Xiao2024, + author = {Xiao, Weiwei and Chen, Yongyong and Shan, Qiben and Wang, Yaowei and Su, Jingyong}, + date = {2024-03}, + journaltitle = {Proceedings of the AAAI Conference on Artificial Intelligence}, + title = {Feature Distribution Matching by Optimal Transport for Effective and Robust Coreset Selection}, + doi = {10.1609/aaai.v38i8.28771}, + issn = {2159-5399}, + number = {8}, + pages = {9196--9204}, + volume = {38}, + file = {:Xiao2024 - Feature Distribution Matching by Optimal Transport for Effective and Robust Coreset Selection.pdf:PDF}, + groups = {Coreset for FL}, + publisher = {Association for the Advancement of Artificial Intelligence (AAAI)}, + qualityassured = {qualityAssured}, + readstatus = {read}, +} + +@InProceedings{Yang2024d, + author = {Yang, Shuo and Cao, Zhe and Guo, Sheng and Zhang, Ruiheng and Luo, Ping and Zhang, Shengping and Nie, Liqiang}, + booktitle = {Proceedings of the 41st International Conference on Machine Learning}, + title = {Mind the Boundary: Coreset Selection via Reconstructing the Decision Boundary}, + editor = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix}, + pages = {55948--55960}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + url = {https://proceedings.mlr.press/v235/yang24b.html}, + volume = {235}, + file = {:Yang2024d - Mind the Boundary_ Coreset Selection Via Reconstructing the Decision Boundary.pdf:PDF}, + groups = {Coreset for FL}, + month = {21--27 Jul}, + pdf = {https://raw.githubusercontent.com/mlresearch/v235/main/assets/yang24b/yang24b.pdf}, + year = 
{2024}, +} + +@Article{Guo2024b, + author = {Guo, Yangyang and Kankanhalli, Mohan}, + date = {2024-11-14}, + title = {SCAN: Bootstrapping Contrastive Pre-training for Data Efficiency}, + doi = {10.48550/ARXIV.2411.09126}, + eprint = {2411.09126}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Guo2024b - SCAN_ Bootstrapping Contrastive Pre Training for Data Efficiency.pdf:PDF:http\://arxiv.org/pdf/2411.09126v1}, + groups = {Coreset for FL}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2024}, +} + +@Article{Nauen2025a, + author = {Nauen, Tobias Christian and Moser, Brian and Raue, Federico and Frolov, Stanislav and Dengel, Andreas}, + date = {2025-03-12}, + title = {ForAug: Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation}, + doi = {10.48550/ARXIV.2503.09399}, + eprint = {2503.09399}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Nauen2025 - ForAug_ Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation.pdf:PDF:http\://arxiv.org/pdf/2503.09399v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, I.2.10; I.2.6; I.4.6, 68T45}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Mehra2025, + author = {Mehra, Akshay and Mittal, Trisha and Gopalakrishnan, Subhadra and Kimball, Joshua}, + date = {2025-02-23}, + title = {Model-agnostic Coreset Selection via LLM-based Concept Bottlenecks}, + doi = {10.48550/ARXIV.2502.16733}, + eprint = {2502.16733}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = 
{:Mehra2025 - Model Agnostic Coreset Selection Via LLM Based Concept Bottlenecks.pdf:PDF:http\://arxiv.org/pdf/2502.16733v1}, + groups = {Coreset for FL}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + qualityassured = {qualityAssured}, + readstatus = {read}, + year = {2025}, +} + +@Article{Zhu2025, + author = {Zhu, Jiachen and Chen, Xinlei and He, Kaiming and LeCun, Yann and Liu, Zhuang}, + date = {2025-03-13}, + title = {Transformers without Normalization}, + doi = {10.48550/ARXIV.2503.10622}, + eprint = {2503.10622}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhu2025 - Transformers without Normalization.pdf:PDF:http\://arxiv.org/pdf/2503.10622v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Dorszewski2025, + author = {Dorszewski, Teresa and Tětková, Lenka and Jenssen, Robert and Hansen, Lars Kai and Wickstrøm, Kristoffer Knutsen}, + date = {2025-03-31}, + title = {From Colors to Classes: Emergence of Concepts in Vision Transformers}, + doi = {10.48550/ARXIV.2503.24071}, + eprint = {2503.24071}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {Vision Transformers (ViTs) are increasingly utilized in various computer vision tasks due to their powerful representation capabilities. However, it remains understudied how ViTs process information layer by layer. Numerous studies have shown that convolutional neural networks (CNNs) extract features of increasing complexity throughout their layers, which is crucial for tasks like domain adaptation and transfer learning. 
ViTs, lacking the same inductive biases as CNNs, can potentially learn global dependencies from the first layers due to their attention mechanisms. Given the increasing importance of ViTs in computer vision, there is a need to improve the layer-wise understanding of ViTs. In this work, we present a novel, layer-wise analysis of concepts encoded in state-of-the-art ViTs using neuron labeling. Our findings reveal that ViTs encode concepts with increasing complexity throughout the network. Early layers primarily encode basic features such as colors and textures, while later layers represent more specific classes, including objects and animals. As the complexity of encoded concepts increases, the number of concepts represented in each layer also rises, reflecting a more diverse and specific set of features. Additionally, different pretraining strategies influence the quantity and category of encoded concepts, with finetuning to specific downstream tasks generally reducing the number of encoded concepts and shifting the concepts to more relevant categories.}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Dorszewski2025 - From Colors to Classes_ Emergence of Concepts in Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2503.24071v1}, + groups = {Reading Group Potential}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Imam2024, + author = {Imam, Mohamed Fazli and Marew, Rufael Fedaku and Hassan, Jameel and Fiaz, Mustansar and Aji, Alham Fikri and Cholakkal, Hisham}, + date = {2024-11-28}, + title = {CLIP meets DINO for Tuning Zero-Shot Classifier using Unlabeled Image Collections}, + doi = {10.48550/ARXIV.2411.19346}, + eprint = {2411.19346}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file 
= {:Imam2024 - CLIP Meets DINO for Tuning Zero Shot Classifier Using Unlabeled Image Collections.pdf:PDF:http\://arxiv.org/pdf/2411.19346v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Kerssies2025, + author = {Kerssies, Tommie and Cavagnero, Niccolò and Hermans, Alexander and Norouzi, Narges and Averta, Giuseppe and Leibe, Bastian and Dubbelman, Gijs and de Geus, Daan}, + date = {2025-03-24}, + title = {Your ViT is Secretly an Image Segmentation Model}, + doi = {10.48550/ARXIV.2503.19108}, + eprint = {2503.19108}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Kerssies2025 - Your ViT Is Secretly an Image Segmentation Model.pdf:PDF:http\://arxiv.org/pdf/2503.19108v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Somvanshi2025, + author = {Somvanshi, Shriyank and Islam, Md Monzurul and Mimi, Mahmuda Sultana and Polock, Sazzad Bin Bashar and Chhetri, Gaurab and Das, Subasish}, + date = {2025-03-22}, + title = {A Survey on Structured State Space Sequence (S4) Models}, + doi = {10.48550/ARXIV.2503.18970}, + eprint = {2503.18970}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Somvanshi2025 - A Survey on Structured State Space Sequence (S4) Models.pdf:PDF:http\://arxiv.org/pdf/2503.18970v1}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Wang2025, + author = {Wang, Qin and Bruns, Benjamin and Scharr, Hanno and Krajsek, Kai}, + date = {2025-03-24}, + title = {Self-Supervised Learning based 
on Transformed Image Reconstruction for Equivariance-Coherent Feature Representation}, + doi = {10.48550/ARXIV.2503.18753}, + eprint = {2503.18753}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Wang2025 - Self Supervised Learning Based on Transformed Image Reconstruction for Equivariance Coherent Feature Representation.pdf:PDF:http\://arxiv.org/pdf/2503.18753v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Hesse2025, + author = {Hesse, Robin and Bağcı, Doğukan and Schiele, Bernt and Schaub-Meyer, Simone and Roth, Stefan}, + date = {2025-03-21}, + title = {Beyond Accuracy: What Matters in Designing Well-Behaved Models?}, + doi = {10.48550/ARXIV.2503.17110}, + eprint = {2503.17110}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Hesse2025 - Beyond Accuracy_ What Matters in Designing Well Behaved Models_.pdf:PDF:http\://arxiv.org/pdf/2503.17110v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Tang2025, + author = {Tang, Zineng and Lian, Long and Eisape, Seun and Wang, XuDong and Herzig, Roei and Yala, Adam and Suhr, Alane and Darrell, Trevor and Chan, David M.}, + date = {2025-03-19}, + title = {TULIP: Towards Unified Language-Image Pretraining}, + doi = {10.48550/ARXIV.2503.15485}, + eprint = {2503.15485}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Tang2025 - TULIP_ Towards Unified Language Image Pretraining.pdf:PDF:http\://arxiv.org/pdf/2503.15485v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), 
Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Maity2025, + author = {Maity, Subhajit and Hitsman, Killian and Li, Xin and Dutta, Aritra}, + date = {2025-03-13}, + title = {Kolmogorov-Arnold Attention: Is Learnable Attention Better For Vision Transformers?}, + doi = {10.48550/ARXIV.2503.10632}, + eprint = {2503.10632}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Maity2025 - Kolmogorov Arnold Attention_ Is Learnable Attention Better for Vision Transformers_.pdf:PDF:http\://arxiv.org/pdf/2503.10632v1}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, I.2.6; I.5.1; I.5.5; I.5.4; I.4.10, 68T07}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Hammoud2025, + author = {Hammoud, Hasan Abed Al Kader and Ghanem, Bernard}, + date = {2025-03-09}, + title = {DiffCLIP: Differential Attention Meets CLIP}, + doi = {10.48550/ARXIV.2503.06626}, + eprint = {2503.06626}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Hammoud2025 - DiffCLIP_ Differential Attention Meets CLIP.pdf:PDF:http\://arxiv.org/pdf/2503.06626v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Luo2025, + author = {Luo, Alan and Yuan, Kaiwen}, + date = {2025-03-06}, + title = {Simple Self Organizing Map with Visual Transformer}, + doi = {10.48550/ARXIV.2503.04121}, + eprint = {2503.04121}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = 
{:Luo2025 - Simple Self Organizing Map with Visual Transformer.pdf:PDF:http\://arxiv.org/pdf/2503.04121v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, 65D19 (Primary)}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Li2025, + author = {Li, Ruining and Boduljak, Gabrijel and Zhou, Jensen (Jinghao)}, + date = {2025-04-03}, + title = {On Vanishing Variance in Transformer Length Generalization}, + doi = {10.48550/ARXIV.2504.02827}, + eprint = {2504.02827}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Li2025 - On Vanishing Variance in Transformer Length Generalization.pdf:PDF:http\://arxiv.org/pdf/2504.02827v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Zhang2025, + author = {Zhang, Zherui and Xu, Rongtao and Zhou, Jie and Wang, Changwei and Pei, Xingtian and Xu, Wenhao and Zhang, Jiguang and Guo, Li and Gao, Longxiang and Xu, Wenbo and Xu, Shibiao}, + date = {2025-05-06}, + title = {Image Recognition with Online Lightweight Vision Transformer: A Survey}, + doi = {10.48550/ARXIV.2505.03113}, + eprint = {2505.03113}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Zhang2025 - Image Recognition with Online Lightweight Vision Transformer_ a Survey.pdf:PDF:http\://arxiv.org/pdf/2505.03113v1}, + groups = {WTF Benchmark}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Xiao2023, + author = {Xiao, Guangxuan and Tian, Yuandong and Chen, Beidi and Han, Song and Lewis, Mike}, + date = {2023-09-29}, + title = 
{Efficient Streaming Language Models with Attention Sinks}, + doi = {10.48550/ARXIV.2309.17453}, + eprint = {2309.17453}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Xiao2023 - Efficient Streaming Language Models with Attention Sinks.pdf:PDF:http\://arxiv.org/pdf/2309.17453v4}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Joseph2025, + author = {Joseph, Sonia and Suresh, Praneet and Goldfarb, Ethan and Hufe, Lorenz and Gandelsman, Yossi and Graham, Robert and Bzdok, Danilo and Samek, Wojciech and Richards, Blake Aaron}, + date = {2025-04-11}, + title = {Steering CLIP's vision transformer with sparse autoencoders}, + doi = {10.48550/ARXIV.2504.08729}, + eprint = {2504.08729}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Joseph2025 - Steering CLIP's Vision Transformer with Sparse Autoencoders.pdf:PDF:http\://arxiv.org/pdf/2504.08729v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Qian2025, + author = {Qian, Zhoujie}, + date = {2025-04-21}, + title = {ECViT: Efficient Convolutional Vision Transformer with Local-Attention and Multi-scale Stages}, + doi = {10.48550/ARXIV.2504.14825}, + eprint = {2504.14825}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Qian2025 - ECViT_ Efficient Convolutional Vision Transformer with Local Attention and Multi Scale Stages.pdf:PDF:http\://arxiv.org/pdf/2504.14825v1}, + groups = {WTF Benchmark}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial 
Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Ji2025, + author = {Ji, Yiping and Saratchandran, Hemanth and Moghaddam, Peyman and Lucey, Simon}, + date = {2025-05-04}, + title = {Always Skip Attention}, + doi = {10.48550/ARXIV.2505.01996}, + eprint = {2505.01996}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ji2025 - Always Skip Attention.pdf:PDF:http\://arxiv.org/pdf/2505.01996v1}, + groups = {WTF Benchmark}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Yamada2025, + author = {Yamada, Yoshihiro}, + date = {2025-04-09}, + title = {CAT: Circular-Convolutional Attention for Sub-Quadratic Transformers}, + doi = {10.48550/ARXIV.2504.06704}, + eprint = {2504.06704}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Yamada2025 - CAT_ Circular Convolutional Attention for Sub Quadratic Transformers.pdf:PDF:http\://arxiv.org/pdf/2504.06704v1}, + groups = {WTF Benchmark}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Liu2025, + author = {Liu, Jiani and Wang, Zhiyuan and Zhang, Zeliang and Huang, Chao and Liang, Susan and Tang, Yunlong and Xu, Chenliang}, + date = {2025-04-15}, + title = {The Sword of Damocles in ViTs: Computational Redundancy Amplifies Adversarial Transferability}, + doi = {10.48550/ARXIV.2504.10804}, + eprint = {2504.10804}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Liu2025 - The Sword of Damocles in ViTs_ Computational Redundancy Amplifies Adversarial 
Transferability.pdf:PDF:http\://arxiv.org/pdf/2504.10804v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Trivedy2025, + author = {Trivedy, Vivek and Almalki, Amani and Latecki, Longin Jan}, + date = {2025-04-10}, + title = {Learning Object Focused Attention}, + doi = {10.48550/ARXIV.2504.08166}, + eprint = {2504.08166}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Trivedy2025 - Learning Object Focused Attention.pdf:PDF:http\://arxiv.org/pdf/2504.08166v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Dey2025, + author = {Dey, Nolan and Zhang, Bin Claire and Noci, Lorenzo and Li, Mufan and Bordelon, Blake and Bergsma, Shane and Pehlevan, Cengiz and Hanin, Boris and Hestness, Joel}, + date = {2025-05-02}, + title = {Don't be lazy: CompleteP enables compute-efficient deep transformers}, + doi = {10.48550/ARXIV.2505.01618}, + eprint = {2505.01618}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Dey2025 - Don't Be Lazy_ CompleteP Enables Compute Efficient Deep Transformers.pdf:PDF:http\://arxiv.org/pdf/2505.01618v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Zuhri2025, + author = {Zuhri, Zayd M. K. 
and Fuadi, Erland Hilman and Aji, Alham Fikri}, + date = {2025-04-29}, + title = {Softpick: No Attention Sink, No Massive Activations with Rectified Softmax}, + doi = {10.48550/ARXIV.2504.20966}, + eprint = {2504.20966}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Share Alike 4.0 International}, + file = {:Zuhri2025 - Softpick_ No Attention Sink, No Massive Activations with Rectified Softmax.pdf:PDF:http\://arxiv.org/pdf/2504.20966v1}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Bolya2025, + author = {Bolya, Daniel and Huang, Po-Yao and Sun, Peize and Cho, Jang Hyun and Madotto, Andrea and Wei, Chen and Ma, Tengyu and Zhi, Jiale and Rajasegaran, Jathushan and Rasheed, Hanoona and Wang, Junke and Monteiro, Marco and Xu, Hu and Dong, Shiyu and Ravi, Nikhila and Li, Daniel and Dollár, Piotr and Feichtenhofer, Christoph}, + date = {2025-04-17}, + title = {Perception Encoder: The best visual embeddings are not at the output of the network}, + doi = {10.48550/ARXIV.2504.13181}, + eprint = {2504.13181}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Bolya2025 - Perception Encoder_ the Best Visual Embeddings Are Not at the Output of the Network.pdf:PDF:http\://arxiv.org/pdf/2504.13181v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Kang2022, + author = {Kang, Ji-Soo and Chung, Kyungyong}, + date = {2022}, + journaltitle = {IEEE Access}, + title = {STAug: Copy-Paste Based Image Augmentation Technique Using Salient Target}, + doi = {10.1109/access.2022.3224141}, + issn = {2169-3536}, + pages = {123605--123613}, + volume = {10}, + file = {:Kang2022 - STAug_ Copy Paste Based Image Augmentation Technique Using 
Salient Target.pdf:PDF}, + priority = {prio1}, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, +} + +@Article{Guo2023, + author = {Guo, Yilu and Shi, Xingyue and Chen, Weijie and Yang, Shicai and Xie, Di and Pu, Shiliang and Zhuang, Yueting}, + date = {2023-01-12}, + title = {1st Place Solution for ECCV 2022 OOD-CV Challenge Image Classification Track}, + doi = {10.48550/ARXIV.2301.04795}, + eprint = {2301.04795}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Guo2023 - 1st Place Solution for ECCV 2022 OOD CV Challenge Image Classification Track.pdf:PDF:http\://arxiv.org/pdf/2301.04795v1}, + groups = {ForAug}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Lappe2025, + author = {Lappe, Alexander and Giese, Martin A.}, + date = {2025-05-09}, + title = {Register and CLS tokens yield a decoupling of local and global features in large ViTs}, + doi = {10.48550/ARXIV.2505.05892}, + eprint = {2505.05892}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Lappe2025 - Register and CLS Tokens Yield a Decoupling of Local and Global Features in Large ViTs.pdf:PDF:http\://arxiv.org/pdf/2505.05892v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Adeel2025, + author = {Adeel, Ahsan}, + date = {2025-05-02}, + title = {Beyond Attention: Toward Machines with Intrinsic Higher Mental States}, + doi = {10.48550/ARXIV.2505.06257}, + eprint = {2505.06257}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International}, + file = {:Adeel2025 - Beyond Attention_ toward Machines 
with Intrinsic Higher Mental States.pdf:PDF:http\://arxiv.org/pdf/2505.06257v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Gerber2025, + author = {Gerber, Isaac}, + date = {2025-05-10}, + title = {Attention Is Not All You Need: The Importance of Feedforward Networks in Transformer Models}, + doi = {10.48550/ARXIV.2505.06633}, + eprint = {2505.06633}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Gerber2025 - Attention Is Not All You Need_ the Importance of Feedforward Networks in Transformer Models.pdf:PDF:http\://arxiv.org/pdf/2505.06633v1}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Kan2025, + author = {Kan, Kelvin and Li, Xingjian and Zhang, Benjamin J. 
and Sahai, Tuhin and Osher, Stanley and Katsoulakis, Markos A.}, + date = {2025-05-16}, + title = {Optimal Control for Transformer Architectures: Enhancing Generalization, Robustness and Efficiency}, + doi = {10.48550/ARXIV.2505.13499}, + eprint = {2505.13499}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Kan2025 - Optimal Control for Transformer Architectures_ Enhancing Generalization, Robustness and Efficiency.pdf:PDF:http\://arxiv.org/pdf/2505.13499v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Optimization and Control (math.OC), FOS: Computer and information sciences, FOS: Mathematics}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@InProceedings{Amsel2025, + author = {Noah Amsel and Gilad Yehudai and Joan Bruna}, + booktitle = {The Thirteenth International Conference on Learning Representations}, + title = {Quality over Quantity in Attention Layers: When Adding More Heads Hurts}, + url = {https://openreview.net/forum?id=y9Xp9NozPR}, + file = {:Amsel2025 - Quality Over Quantity in Attention Layers_ When Adding More Heads Hurts.pdf:PDF}, + priority = {prio2}, + year = {2025}, +} + +@Article{Nordstroem2025, + author = {Nordström, David and Edstedt, Johan and Kahl, Fredrik and Bökman, Georg}, + date = {2025-05-21}, + title = {Stronger ViTs With Octic Equivariance}, + doi = {10.48550/ARXIV.2505.15441}, + eprint = {2505.15441}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Share Alike 4.0 International}, + file = {:Nordstroem2025 - Stronger ViTs with Octic Equivariance.pdf:PDF:http\://arxiv.org/pdf/2505.15441v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Braso2025, + author = {Brasó, Guillem and 
Ošep, Aljoša and Leal-Taixé, Laura}, + date = {2025-05-22}, + title = {Native Segmentation Vision Transformers}, + doi = {10.48550/ARXIV.2505.16993}, + eprint = {2505.16993}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Braso2025 - Native Segmentation Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2505.16993v1}, + groups = {WTF Benchmark}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Shan2025, + author = {Shan, Jiquan and Wang, Junxiao and Zhao, Lifeng and Cai, Liang and Zhang, Hongyuan and Liritzis, Ioannis}, + date = {2025-05-22}, + title = {AnchorFormer: Differentiable Anchor Attention for Efficient Vision Transformer}, + doi = {10.48550/ARXIV.2505.16463}, + eprint = {2505.16463}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Shan2025 - AnchorFormer_ Differentiable Anchor Attention for Efficient Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2505.16463v1}, + groups = {WTF Benchmark}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Ye2024a, + author = {Ye, Tianzhu and Dong, Li and Xia, Yuqing and Sun, Yutao and Zhu, Yi and Huang, Gao and Wei, Furu}, + date = {2024-10-07}, + title = {Differential Transformer}, + doi = {10.48550/ARXIV.2410.05258}, + eprint = {2410.05258}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ye2024a - Differential Transformer.pdf:PDF:http\://arxiv.org/pdf/2410.05258v2}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = 
{prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Fuller2025, + author = {Fuller, Anthony and Yassin, Yousef and Wen, Junfeng and Kyrollos, Daniel G. and Ibrahim, Tarek and Green, James R. and Shelhamer, Evan}, + date = {2025-05-23}, + title = {LookWhere? Efficient Visual Recognition by Learning Where to Look and What to See from Self-Supervision}, + doi = {10.48550/ARXIV.2505.18051}, + eprint = {2505.18051}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Fuller2025 - LookWhere_ Efficient Visual Recognition by Learning Where to Look and What to See from Self Supervision.pdf:PDF:http\://arxiv.org/pdf/2505.18051v1}, + groups = {WTF Benchmark}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Shahabodini2025, + author = {Shahabodini, Sajjad and Mansoori, Mobina and Bayatmakou, Farnoush and Abouei, Jamshid and Plataniotis, Konstantinos N. 
and Mohammadi, Arash}, + date = {2025-05-26}, + title = {The Missing Point in Vision Transformers for Universal Image Segmentation}, + doi = {10.48550/ARXIV.2505.19795}, + eprint = {2505.19795}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Shahabodini2025 - The Missing Point in Vision Transformers for Universal Image Segmentation.pdf:PDF:http\://arxiv.org/pdf/2505.19795v1}, + groups = {WTF Benchmark}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Zheng2025a, + author = {Zheng, Jianqiao and Li, Xueqian and Saratchandran, Hemanth and Lucey, Simon}, + date = {2025-05-26}, + title = {Structured Initialization for Vision Transformers}, + doi = {10.48550/ARXIV.2505.19985}, + eprint = {2505.19985}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Zheng2025a - Structured Initialization for Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2505.19985v1}, + groups = {Reading Group Potential}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Kong2025, + author = {Kong, Zhenglun and Li, Yize and Zeng, Fanhu and Xin, Lei and Messica, Shvat and Lin, Xue and Zhao, Pu and Kellis, Manolis and Tang, Hao and Zitnik, Marinka}, + date = {2025-05-23}, + title = {Token Reduction Should Go Beyond Efficiency in Generative Models -- From Vision, Language to Multimodality}, + doi = {10.48550/ARXIV.2505.18227}, + eprint = {2505.18227}, + eprintclass = {cs.LG}, + eprinttype = 
{arXiv}, +   copyright    = {Creative Commons Attribution 4.0 International}, +   file         = {:Kong2025 - Token Reduction Should Go beyond Efficiency in Generative Models from Vision, Language to Multimodality.pdf:PDF:http\://arxiv.org/pdf/2505.18227v1}, +   groups       = {WTF Benchmark}, +   keywords     = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, +   priority     = {prio3}, +   publisher    = {arXiv}, +   year         = {2025}, +} + +@InProceedings{Chowdhury2025, +   author    = {Chowdhury, Amartya Roy and Diddigi, Raghuram Bharadwaj and J, Prabuchandran K and Tripathi, Achyut Mani}, +   booktitle = {Proceedings of the Winter Conference on Applications of Computer Vision (WACV)}, +   title     = {Bandit Based Attention Mechanism in Vision Transformers}, +   pages     = {9579--9588}, +   file      = {:Chowdhury2025 - Bandit Based Attention Mechanism in Vision Transformers.pdf:PDF}, +   groups    = {WTF Benchmark}, +   month     = {February}, +   priority  = {prio3}, +   year      = {2025}, +} + +@Article{Fuller2025a, +   author       = {Fuller, Anthony and Yassin, Yousef and Kyrollos, Daniel G. and Shelhamer, Evan and Green, James R.}, +   date         = {2025-02-20}, +   title        = {Simpler Fast Vision Transformers with a Jumbo CLS Token}, +   doi          = {10.48550/ARXIV.2502.15021}, +   eprint       = {2502.15021}, +   eprintclass  = {cs.CV}, +   eprinttype   = {arXiv}, +   copyright    = {Creative Commons Attribution 4.0 International}, +   file         = {:Fuller2025a - Simpler Fast Vision Transformers with a Jumbo CLS Token.pdf:PDF:http\://arxiv.org/pdf/2502.15021v2}, +   groups       = {WTF Benchmark}, +   keywords     = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, +   priority     = {prio3}, +   publisher    = {arXiv}, +   year         = {2025}, +} + +@Article{Yao2025, +   author       = {Yao, Jingfeng and Yang, Bin and Wang, Xinggang}, +   date         = {2025-01-02}, +   title        = {Reconstruction vs. 
Generation: Taming Optimization Dilemma in Latent Diffusion Models}, + doi = {10.48550/ARXIV.2501.01423}, + eprint = {2501.01423}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Yao2025 - Reconstruction Vs. Generation_ Taming Optimization Dilemma in Latent Diffusion Models.pdf:PDF:http\://arxiv.org/pdf/2501.01423v3}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Darlow2025, + author = {Darlow, Luke and Regan, Ciaran and Risi, Sebastian and Seely, Jeffrey and Jones, Llion}, + date = {2025-05-08}, + title = {Continuous Thought Machines}, + doi = {10.48550/ARXIV.2505.05522}, + eprint = {2505.05522}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Darlow2025 - Continuous Thought Machines.pdf:PDF:http\://arxiv.org/pdf/2505.05522v3}, + groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Deng2025, + author = {Deng, Chaorui and Zhu, Deyao and Li, Kunchang and Gou, Chenhui and Li, Feng and Wang, Zeyu and Zhong, Shu and Yu, Weihao and Nie, Xiaonan and Song, Ziang and Shi, Guang and Fan, Haoqi}, + date = {2025-05-20}, + title = {Emerging Properties in Unified Multimodal Pretraining}, + doi = {10.48550/ARXIV.2505.14683}, + eprint = {2505.14683}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Deng2025 - Emerging Properties in Unified Multimodal Pretraining.pdf:PDF:http\://arxiv.org/pdf/2505.14683v2}, + groups = {Reading Group Potential}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), 
FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Nie2025, + author = {Nie, Shen and Zhu, Fengqi and You, Zebin and Zhang, Xiaolu and Ou, Jingyang and Hu, Jun and Zhou, Jun and Lin, Yankai and Wen, Ji-Rong and Li, Chongxuan}, + date = {2025-02-14}, + title = {Large Language Diffusion Models}, + doi = {10.48550/ARXIV.2502.09992}, + eprint = {2502.09992}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Nie2025 - Large Language Diffusion Models.pdf:PDF:http\://arxiv.org/pdf/2502.09992v2}, + groups = {Reading Group Potential}, + keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Jha2025, + author = {Jha, Rishi and Zhang, Collin and Shmatikov, Vitaly and Morris, John X.}, + date = {2025-05-18}, + title = {Harnessing the Universal Geometry of Embeddings}, + doi = {10.48550/ARXIV.2505.12540}, + eprint = {2505.12540}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Jha2025 - Harnessing the Universal Geometry of Embeddings.pdf:PDF:http\://arxiv.org/pdf/2505.12540v2}, + groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Xu2025, + author = {Xu, Yi and Li, Chengzu and Zhou, Han and Wan, Xingchen and Zhang, Caiqi and Korhonen, Anna and Vulić, Ivan}, + date = {2025-05-16}, + title = {Visual Planning: Let's Think Only with Images}, + doi = {10.48550/ARXIV.2505.11409}, + eprint = {2505.11409}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Xu2025 - Visual Planning_ Let's Think Only with Images.pdf:PDF:http\://arxiv.org/pdf/2505.11409v1}, + 
groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@InProceedings{Wang2024f, + author = {Qizhou Wang and Yong Lin and Yongqiang Chen and Ludwig Schmidt and Bo Han and Tong Zhang}, + booktitle = {The Thirty-eighth Annual Conference on Neural Information Processing Systems}, + title = {A Sober Look at the Robustness of {CLIP}s to Spurious Features}, + eprint = {2403.11497}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Wang2024f - A Sober Look at the Robustness of CLIPs to Spurious Features.pdf:PDF:http\://arxiv.org/pdf/2403.11497v2}, + groups = {ForAug}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + priority = {prio2}, + year = {2024}, +} + +@Article{Aghagolzadeh2025, + author = {Aghagolzadeh, Hossein and Ezoji, Mehdi}, + date = {2025-02-01}, + title = {Contrastive Forward-Forward: A Training Algorithm of Vision Transformer}, + doi = {10.48550/ARXIV.2502.00571}, + eprint = {2502.00571}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Aghagolzadeh2025 - Contrastive Forward Forward_ a Training Algorithm of Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2502.00571v1}, + groups = {Reading Group Potential}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Qin2024, + author = {Qin, Tian and Deng, Zhiwei and Alvarez-Melis, David}, + date = {2024-06-15}, + title = {A Label is Worth a Thousand Images in Dataset Distillation}, + 
doi = {10.48550/ARXIV.2406.10485}, + eprint = {2406.10485}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + abstract = {Data $\textit{quality}$ is a crucial factor in the performance of machine learning models, a principle that dataset distillation methods exploit by compressing training datasets into much smaller counterparts that maintain similar downstream performance. Understanding how and why data distillation methods work is vital not only for improving these methods but also for revealing fundamental characteristics of "good" training data. However, a major challenge in achieving this goal is the observation that distillation approaches, which rely on sophisticated but mostly disparate methods to generate synthetic data, have little in common with each other. In this work, we highlight a largely overlooked aspect common to most of these methods: the use of soft (probabilistic) labels. Through a series of ablation experiments, we study the role of soft labels in depth. Our results reveal that the main factor explaining the performance of state-of-the-art distillation methods is not the specific techniques used to generate synthetic data but rather the use of soft labels. Furthermore, we demonstrate that not all soft labels are created equal; they must contain $\textit{structured information}$ to be beneficial. We also provide empirical scaling laws that characterize the effectiveness of soft labels as a function of images-per-class in the distilled dataset and establish an empirical Pareto frontier for data-efficient learning. Combined, our findings challenge conventional wisdom in dataset distillation, underscore the importance of soft labels in learning, and suggest new directions for improving distillation methods. 
Code for all experiments is available at https://github.com/sunnytqin/no-distillation.}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Qin2024 - A Label Is Worth a Thousand Images in Dataset Distillation.pdf:PDF:http\://arxiv.org/pdf/2406.10485v2}, + groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Kamboj2024, + author = {Kamboj, Abhi and Do, Minh}, + date = {2024-03-17}, + title = {A Survey of IMU Based Cross-Modal Transfer Learning in Human Activity Recognition}, + doi = {10.48550/ARXIV.2403.15444}, + eprint = {2403.15444}, + eprintclass = {eess.SP}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Kamboj2024 - A Survey of IMU Based Cross Modal Transfer Learning in Human Activity Recognition.pdf:PDF:http\://arxiv.org/pdf/2403.15444v1}, + keywords = {Signal Processing (eess.SP), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Image and Video Processing (eess.IV), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2024}, +} + +@InProceedings{Xing2018, + author = {Xing, Tianwei and Sandha, Sandeep Singh and Balaji, Bharathan and Chakraborty, Supriyo and Srivastava, Mani}, + booktitle = {Proceedings of the 1st International Workshop on Edge Systems, Analytics and Networking}, + title = {Enabling Edge Devices that Learn from Each Other: Cross Modal Training for Activity Recognition}, + doi = {10.1145/3213344.3213351}, + isbn = {9781450358378}, + location = {Munich, Germany}, + pages = {37–42}, + publisher = {Association for Computing Machinery}, + series = {EdgeSys'18}, + url = {https://doi.org/10.1145/3213344.3213351}, + address = {New York, NY, 
USA}, +   file       = {:Xing2018 - Enabling Edge Devices That Learn from Each Other_ Cross Modal Training for Activity Recognition.pdf:PDF}, +   keywords   = {activity recognition, cross modality, edge devices, shared latent representation, transfer learning}, +   numpages   = {6}, +   priority   = {prio2}, +   year       = {2018}, +} + +@InProceedings{Georgakis2022, +   author    = {Georgakis, Georgios and Schmeckpeper, Karl and Wanchoo, Karan and Dan, Soham and Miltsakaki, Eleni and Roth, Dan and Daniilidis, Kostas}, +   booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, +   title     = {Cross-Modal Map Learning for Vision and Language Navigation}, +   pages     = {15460--15470}, +   file      = {:Georgakis2022 - Cross Modal Map Learning for Vision and Language Navigation.pdf:PDF}, +   month     = {June}, +   priority  = {prio2}, +   year      = {2022}, +} + +@Article{Ma2024a, +   author       = {Ma, Wenxuan and Li, Shuang and Cai, Lincan and Kang, Jingxuan}, +   date         = {2024-06-27}, +   title        = {Learning Modality Knowledge Alignment for Cross-Modality Transfer}, +   doi          = {10.48550/ARXIV.2406.18864}, +   eprint       = {2406.18864}, +   eprintclass  = {cs.CV}, +   eprinttype   = {arXiv}, +   copyright    = {Creative Commons Attribution 4.0 International}, +   file         = {:Ma2024a - Learning Modality Knowledge Alignment for Cross Modality Transfer.pdf:PDF:http\://arxiv.org/pdf/2406.18864v1}, +   keywords     = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, +   priority     = {prio2}, +   publisher    = {arXiv}, +   year         = {2024}, +} + +@InProceedings{Xue2023, +   author    = {Zihui Xue and Zhengqi Gao and Sucheng Ren and Hang Zhao}, +   booktitle = {The Eleventh International Conference on Learning Representations}, +   title     = {The Modality Focusing Hypothesis: Towards Understanding Crossmodal Knowledge Distillation}, +   url       = {https://openreview.net/forum?id=w0QXrZ3N-s}, +   file      = {:Xue2023 - The Modality Focusing Hypothesis_ Towards Understanding Crossmodal Knowledge Distillation.pdf:PDF}, +   priority  = 
{prio1}, + year = {2023}, +} + +@Article{Mansourian2025, + author = {Mansourian, Amir M. and Ahmadi, Rozhan and Ghafouri, Masoud and Babaei, Amir Mohammad and Golezani, Elaheh Badali and Ghamchi, Zeynab Yasamani and Ramezanian, Vida and Taherian, Alireza and Dinashi, Kimia and Miri, Amirali and Kasaei, Shohreh}, + date = {2025-03-15}, + title = {A Comprehensive Survey on Knowledge Distillation}, + doi = {10.48550/ARXIV.2503.12067}, + eprint = {2503.12067}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Mansourian2025 - A Comprehensive Survey on Knowledge Distillation.pdf:PDF:http\://arxiv.org/pdf/2503.12067v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@InProceedings{Peng2019, + author = {Peng, Baoyun and Jin, Xiao and Liu, Jiaheng and Li, Dongsheng and Wu, Yichao and Liu, Yu and Zhou, Shunfeng and Zhang, Zhaoning}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Correlation Congruence for Knowledge Distillation}, + file = {:Peng2019 - Correlation Congruence for Knowledge Distillation.pdf:PDF}, + month = {October}, + priority = {prio1}, + year = {2019}, +} + +@InProceedings{Tung2019, + author = {Tung, Frederick and Mori, Greg}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + title = {Similarity-Preserving Knowledge Distillation}, + file = {:Tung2019 - Similarity Preserving Knowledge Distillation.pdf:PDF}, + month = {October}, + priority = {prio1}, + year = {2019}, +} + +@Article{Zhao2024, + author = {Zhao, Hongbo and Ni, Bolin and Wang, Haochen and Fan, Junsong and Zhu, Fei and Wang, Yuxi and Chen, Yuntao and Meng, Gaofeng and Zhang, Zhaoxiang}, + date = {2024-03-18}, + title = {Continual Forgetting for Pre-trained Vision Models}, + doi = 
{10.48550/ARXIV.2403.11530}, + eprint = {2403.11530}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhao2024 - Continual Forgetting for Pre Trained Vision Models.pdf:PDF:http\://arxiv.org/pdf/2403.11530v2}, + groups = {Reading Group Potential}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Shani2025, + author = {Shani, Chen and Jurafsky, Dan and LeCun, Yann and Shwartz-Ziv, Ravid}, + date = {2025-05-21}, + title = {From Tokens to Thoughts: How LLMs and Humans Trade Compression for Meaning}, + doi = {10.48550/ARXIV.2505.17117}, + eprint = {2505.17117}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Shani2025 - From Tokens to Thoughts_ How LLMs and Humans Trade Compression for Meaning.pdf:PDF:http\://arxiv.org/pdf/2505.17117v3}, + groups = {Reading Group Potential}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Information Theory (cs.IT), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Huh2024, + author = {Huh, Minyoung and Cheung, Brian and Wang, Tongzhou and Isola, Phillip}, + date = {2024-05-13}, + title = {The Platonic Representation Hypothesis}, + doi = {10.48550/ARXIV.2405.07987}, + eprint = {2405.07987}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Huh2024 - The Platonic Representation Hypothesis.pdf:PDF:http\://arxiv.org/pdf/2405.07987v5}, + groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + 
+@Article{Dohmatob2024, + author = {Dohmatob, Elvis and Feng, Yunzhen and Yang, Pu and Charton, Francois and Kempe, Julia}, + date = {2024-02-10}, + journaltitle = {ICML 2024}, + title = {A Tale of Tails: Model Collapse as a Change of Scaling Laws}, + doi = {10.48550/ARXIV.2402.07043}, + eprint = {2402.07043}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + abstract = {As AI model size grows, neural scaling laws have become a crucial tool to predict the improvements of large models when increasing capacity and the size of original (human or natural) training data. Yet, the widespread use of popular models means that the ecosystem of online data and text will co-evolve to progressively contain increased amounts of synthesized data. In this paper we ask: How will the scaling laws change in the inevitable regime where synthetic data makes its way into the training corpus? Will future models, still improve, or be doomed to degenerate up to total (model) collapse? We develop a theoretical framework of model collapse through the lens of scaling laws. We discover a wide range of decay phenomena, analyzing loss of scaling, shifted scaling with number of generations, the ''un-learning" of skills, and grokking when mixing human and synthesized data. 
Our theory is validated by large-scale experiments with a transformer on an arithmetic task and text generation using the large language model Llama2.}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Dohmatob2024 - A Tale of Tails_ Model Collapse As a Change of Scaling Laws.pdf:PDF:http\://arxiv.org/pdf/2402.07043v2}, + groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Morawiecki2022, + author = {Morawiecki, Paweł and Krutsylo, Andrii and Wołczyk, Maciej and Śmieja, Marek}, + date = {2022-06-28}, + title = {Hebbian Continual Representation Learning}, + doi = {10.48550/ARXIV.2207.04874}, + eprint = {2207.04874}, + eprintclass = {cs.NE}, + eprinttype = {arXiv}, + abstract = {Continual Learning aims to bring machine learning into a more realistic scenario, where tasks are learned sequentially and the i.i.d. assumption is not preserved. Although this setting is natural for biological systems, it proves very difficult for machine learning models such as artificial neural networks. To reduce this performance gap, we investigate the question whether biologically inspired Hebbian learning is useful for tackling continual challenges. In particular, we highlight a realistic and often overlooked unsupervised setting, where the learner has to build representations without any supervision. By combining sparse neural networks with Hebbian learning principle, we build a simple yet effective alternative (HebbCL) to typical neural network models trained via the gradient descent. Due to Hebbian learning, the network have easily interpretable weights, which might be essential in critical application such as security or healthcare. We demonstrate the efficacy of HebbCL in an unsupervised learning setting applied to MNIST and Omniglot datasets. 
We also adapt the algorithm to the supervised scenario and obtain promising results in the class-incremental learning.}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Morawiecki2022 - Hebbian Continual Representation Learning.pdf:PDF:http\://arxiv.org/pdf/2207.04874v1}, + groups = {Reading Group Potential}, + keywords = {Neural and Evolutionary Computing (cs.NE), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Mason2025, + author = {Mason, Sebastian Ray and Gjølbye, Anders and Højbjerg, Phillip Chavarria and Tětková, Lenka and Hansen, Lars Kai}, + date = {2025-09-18}, + title = {Large Vision Models Can Solve Mental Rotation Problems}, + doi = {10.48550/ARXIV.2509.15271}, + eprint = {2509.15271}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {Mental rotation is a key test of spatial reasoning in humans and has been central to understanding how perception supports cognition. Despite the success of modern vision transformers, it is still unclear how well these models develop similar abilities. In this work, we present a systematic evaluation of ViT, CLIP, DINOv2, and DINOv3 across a range of mental-rotation tasks, from simple block structures similar to those used by Shepard and Metzler to study human cognition, to more complex block figures, three types of text, and photo-realistic objects. By probing model representations layer by layer, we examine where and how these networks succeed. 
We find that i) self-supervised ViTs capture geometric structure better than supervised ViTs; ii) intermediate layers perform better than final layers; iii) task difficulty increases with rotation complexity and occlusion, mirroring human reaction times and suggesting similar constraints in embedding space representations.}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Mason2025 - Large Vision Models Can Solve Mental Rotation Problems.pdf:PDF:http\://arxiv.org/pdf/2509.15271v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Helbling2025, + author = {Helbling, Alec and Meral, Tuna Han Salih and Hoover, Ben and Yanardag, Pinar and Chau, Duen Horng}, + date = {2025-02-06}, + title = {ConceptAttention: Diffusion Transformers Learn Highly Interpretable Features}, + doi = {10.48550/ARXIV.2502.04320}, + eprint = {2502.04320}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {Do the rich representations of multi-modal diffusion transformers (DiTs) exhibit unique properties that enhance their interpretability? We introduce ConceptAttention, a novel method that leverages the expressive power of DiT attention layers to generate high-quality saliency maps that precisely locate textual concepts within images. Without requiring additional training, ConceptAttention repurposes the parameters of DiT attention layers to produce highly contextualized concept embeddings, contributing the major discovery that performing linear projections in the output space of DiT attention layers yields significantly sharper saliency maps compared to commonly used cross-attention maps. ConceptAttention even achieves state-of-the-art performance on zero-shot image segmentation benchmarks, outperforming 15 other zero-shot interpretability methods on the ImageNet-Segmentation dataset. 
ConceptAttention works for popular image models and even seamlessly generalizes to video generation. Our work contributes the first evidence that the representations of multi-modal DiTs are highly transferable to vision tasks like segmentation.}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Helbling2025 - ConceptAttention_ Diffusion Transformers Learn Highly Interpretable Features.pdf:PDF:http\://arxiv.org/pdf/2502.04320v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Simeoni2025, + author = {Siméoni, Oriane and Vo, Huy V. and Seitzer, Maximilian and Baldassarre, Federico and Oquab, Maxime and Jose, Cijo and Khalidov, Vasil and Szafraniec, Marc and Yi, Seungeun and Ramamonjisoa, Michaël and Massa, Francisco and Haziza, Daniel and Wehrstedt, Luca and Wang, Jianyuan and Darcet, Timothée and Moutakanni, Théo and Sentana, Leonel and Roberts, Claire and Vedaldi, Andrea and Tolan, Jamie and Brandt, John and Couprie, Camille and Mairal, Julien and Jégou, Hervé and Labatut, Patrick and Bojanowski, Piotr}, + date = {2025-08-13}, + title = {DINOv3}, + doi = {10.48550/ARXIV.2508.10104}, + eprint = {2508.10104}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + abstract = {Self-supervised learning holds the promise of eliminating the need for manual data annotation, enabling models to scale effortlessly to massive datasets and larger architectures. By not being tailored to specific tasks or domains, this training paradigm has the potential to learn visual representations from diverse sources, ranging from natural to aerial images -- using a single algorithm. This technical report introduces DINOv3, a major milestone toward realizing this vision by leveraging simple yet effective strategies. 
First, we leverage the benefit of scaling both dataset and model size by careful data preparation, design, and optimization. Second, we introduce a new method called Gram anchoring, which effectively addresses the known yet unsolved issue of dense feature maps degrading during long training schedules. Finally, we apply post-hoc strategies that further enhance our models' flexibility with respect to resolution, model size, and alignment with text. As a result, we present a versatile vision foundation model that outperforms the specialized state of the art across a broad range of settings, without fine-tuning. DINOv3 produces high-quality dense features that achieve outstanding performance on various vision tasks, significantly surpassing previous self- and weakly-supervised foundation models. We also share the DINOv3 suite of vision models, designed to advance the state of the art on a wide spectrum of tasks and data by providing scalable solutions for diverse resource constraints and deployment scenarios.}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Simeoni2025 - DINOv3.pdf:PDF:http\://arxiv.org/pdf/2508.10104v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Zhou2022a, + author = {Zhou, Minghao and Wang, Quanziang and Shu, Jun and Zhao, Qian and Meng, Deyu}, + date = {2022-02-16}, + title = {Diagnosing Batch Normalization in Class Incremental Learning}, + doi = {10.48550/ARXIV.2202.08025}, + eprint = {2202.08025}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + abstract = {Extensive researches have applied deep neural networks (DNNs) in class incremental learning (Class-IL). As building blocks of DNNs, batch normalization (BN) standardizes intermediate feature maps and has been widely validated to improve training stability and convergence. 
However, we claim that the direct use of standard BN in Class-IL models is harmful to both the representation learning and the classifier training, thus exacerbating catastrophic forgetting. In this paper we investigate the influence of BN on Class-IL models by illustrating such BN dilemma. We further propose BN Tricks to address the issue by training a better feature extractor while eliminating classification bias. Without inviting extra hyperparameters, we apply BN Tricks to three baseline rehearsal-based methods, ER, DER++ and iCaRL. Through comprehensive experiments conducted on benchmark datasets of Seq-CIFAR-10, Seq-CIFAR-100 and Seq-Tiny-ImageNet, we show that BN Tricks can bring significant performance gains to all adopted baselines, revealing its potential generality along this line of research.}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhou2022a - Diagnosing Batch Normalization in Class Incremental Learning.pdf:PDF:http\://arxiv.org/pdf/2202.08025v1}, + groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Kosowski2025, + author = {Kosowski, Adrian and Uznański, Przemysław and Chorowski, Jan and Stamirowska, Zuzanna and Bartoszkiewicz, Michał}, + date = {2025-09-30}, + title = {The Dragon Hatchling: The Missing Link between the Transformer and Models of the Brain}, + doi = {10.48550/ARXIV.2509.26507}, + eprint = {2509.26507}, + eprintclass = {cs.NE}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Kosowski2025 - The Dragon Hatchling_ the Missing Link between the Transformer and Models of the Brain.pdf:PDF:http\://arxiv.org/pdf/2509.26507v1}, + keywords = {Neural and Evolutionary Computing (cs.NE), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information 
sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Ruiz2020, + author = {Ruiz, Alejandro Hernandez and Vilalta, Armand and Moreno-Noguer, Francesc}, + date = {2020-06-22}, + title = {Neural Cellular Automata Manifold}, + doi = {10.48550/ARXIV.2006.12155}, + eprint = {2006.12155}, + eprintclass = {cs.NE}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Ruiz2020 - Neural Cellular Automata Manifold.pdf:PDF:http\://arxiv.org/pdf/2006.12155v3}, + groups = {Reading Group Potential}, + keywords = {Neural and Evolutionary Computing (cs.NE), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2020}, +} + +@InProceedings{Rahat2025, + author = {Rahat, Fazle and Hossain, M Shifat and Ahmed, Md Rubel and Jha, Sumit Kumar and Ewetz, Rickard}, + booktitle = {Proceedings of the Winter Conference on Applications of Computer Vision (WACV)}, + title = {Data Augmentation for Image Classification using Generative AI}, + pages = {4173-4182}, + file = {:Rahat2025 - Data Augmentation for Image Classification Using Generative AI.pdf:PDF}, + month = {February}, + priority = {prio1}, + year = {2025}, +} + +@Article{Abdullaev2025, + author = {Abdullaev, Laziz U. 
and Tkachenko, Maksim and Nguyen, Tan M.}, + date = {2025-06-12}, + title = {Revisiting Transformers with Insights from Image Filtering}, + doi = {10.48550/ARXIV.2506.10371}, + eprint = {2506.10371}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Abdullaev2025 - Revisiting Transformers with Insights from Image Filtering.pdf:PDF:http\://arxiv.org/pdf/2506.10371v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Jiang2025, + author = {Jiang, Nick and Dravid, Amil and Efros, Alexei and Gandelsman, Yossi}, + date = {2025-06-09}, + title = {Vision Transformers Don't Need Trained Registers}, + doi = {10.48550/ARXIV.2506.08010}, + eprint = {2506.08010}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Jiang2025 - Vision Transformers Don't Need Trained Registers.pdf:PDF:http\://arxiv.org/pdf/2506.08010v4}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Dong2025, + author = {Dong, Yihe and Noci, Lorenzo and Khodak, Mikhail and Li, Mufan}, + date = {2025-06-01}, + title = {Is Random Attention Sufficient for Sequence Modeling? 
Disentangling Trainable Components in the Transformer}, + doi = {10.48550/ARXIV.2506.01115}, + eprint = {2506.01115}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Dong2025 - Is Random Attention Sufficient for Sequence Modeling_ Disentangling Trainable Components in the Transformer.pdf:PDF:http\://arxiv.org/pdf/2506.01115v3}, + groups = {Reading Group Potential}, + keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Nadeem2025, + author = {Nadeem, Numair and Anwar, Saeed and Asad, Muhammad Hamza and Bais, Abdul}, + date = {2025-06-16}, + title = {HVL: Semi-Supervised Segmentation leveraging Hierarchical Vision-Language Synergy with Dynamic Text-Spatial Query Alignment}, + doi = {10.48550/ARXIV.2506.13925}, + eprint = {2506.13925}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Nadeem2025 - HVL_ Semi Supervised Segmentation Leveraging Hierarchical Vision Language Synergy with Dynamic Text Spatial Query Alignment.pdf:PDF:http\://arxiv.org/pdf/2506.13925v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Li2025a, + author = {Li, Tianqin and Wen, Ziqi and Song, Leiran and Liu, Jun and Jing, Zhi and Lee, Tai Sing}, + date = {2025-05-31}, + title = {From Local Cues to Global Percepts: Emergent Gestalt Organization in Self-Supervised Vision Models}, + doi = {10.48550/ARXIV.2506.00718}, + eprint = {2506.00718}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Li2025a - From Local Cues to Global Percepts_ Emergent Gestalt Organization in Self 
Supervised Vision Models.pdf:PDF:http\://arxiv.org/pdf/2506.00718v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@InProceedings{Peng2025, + author = {Peng, Zelin and Huang, Yu and Xu, Zhengqin and Tang, Feilong and Hu, Ming and Yang, Xiaokang and Shen, Wei}, + booktitle = {2025 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title = {Star with Bilinear Mapping}, + doi = {10.1109/CVPR52734.2025.02355}, + pages = {25292-25302}, + file = {:Peng2025 - Star with Bilinear Mapping.pdf:PDF}, + keywords = {Computer vision;Computational modeling;Semantic segmentation;Stars;Computer architecture;Transformers;Complexity theory;Computational efficiency;Context modeling;Image classification}, + priority = {prio3}, + year = {2025}, +} + +@Article{Kuzucu2025, + author = {Kuzucu, Selim and Naeem, Muhammad Ferjad and Kukleva, Anna and Tombari, Federico and Schiele, Bernt}, + date = {2025-07-01}, + title = {Language-Unlocked ViT (LUViT): Empowering Self-Supervised Vision Transformers with LLMs}, + doi = {10.48550/ARXIV.2507.00754}, + eprint = {2507.00754}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}, + file = {:Kuzucu2025 - Language Unlocked ViT (LUViT)_ Empowering Self Supervised Vision Transformers with LLMs.pdf:PDF:http\://arxiv.org/pdf/2507.00754v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Hanna2025, + author = {Hanna, Joelle and Borth, Damian}, + date = {2025-07-09}, + title = {Know Your Attention Maps: Class-specific Token Masking for Weakly Supervised Semantic Segmentation}, + doi = {10.48550/ARXIV.2507.06848}, + eprint = {2507.06848}, + eprintclass = {cs.CV}, 
+ eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Hanna2025 - Know Your Attention Maps_ Class Specific Token Masking for Weakly Supervised Semantic Segmentation.pdf:PDF:http\://arxiv.org/pdf/2507.06848v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Bai2025, + author = {Bai, Xiaoyan and Pres, Itamar and Deng, Yuntian and Tan, Chenhao and Shieber, Stuart and Viégas, Fernanda and Wattenberg, Martin and Lee, Andrew}, + date = {2025-09-30}, + title = {Why Can't Transformers Learn Multiplication? Reverse-Engineering Reveals Long-Range Dependency Pitfalls}, + doi = {10.48550/ARXIV.2510.00184}, + eprint = {2510.00184}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Bai2025 - Why Can't Transformers Learn Multiplication_ Reverse Engineering Reveals Long Range Dependency Pitfalls.pdf:PDF:http\://arxiv.org/pdf/2510.00184v1}, + keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Yu2025, + author = {Yu, Ruoxi and Jiang, Haotian and Cheng, Jingpu and Yu, Penghao and Li, Qianxiao and Li, Zhong}, + date = {2025-10-04}, + title = {Allocation of Parameters in Transformers}, + doi = {10.48550/ARXIV.2510.03784}, + eprint = {2510.03784}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Yu2025 - Allocation of Parameters in Transformers.pdf:PDF:http\://arxiv.org/pdf/2510.03784v1}, + keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Jain2025, + author = {Jain, Siddharth and Karthik, 
Shyamgopal and Gandhi, Vineet}, + date = {2025-10-25}, + journaltitle = {Transactions on Machine Learning Research (TMLR), 2025}, + title = {Simplifying Knowledge Transfer in Pretrained Models}, + doi = {10.48550/ARXIV.2510.22208}, + eprint = {2510.22208}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Jain2025 - Simplifying Knowledge Transfer in Pretrained Models.pdf:PDF:http\://arxiv.org/pdf/2510.22208v1}, + keywords = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + priority = {prio2}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Liu2025a, + author = {Liu, Yiming and Zhang, Yuhui and Ghosh, Dhruba and Schmidt, Ludwig and Yeung-Levy, Serena}, + date = {2025-10-13}, + title = {Data or Language Supervision: What Makes CLIP Better than DINO?}, + doi = {10.48550/ARXIV.2510.11835}, + eprint = {2510.11835}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Liu2025a - Data or Language Supervision_ What Makes CLIP Better Than DINO_.pdf:PDF:http\://arxiv.org/pdf/2510.11835v1}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), Multimedia (cs.MM), FOS: Computer and information sciences}, + priority = {prio1}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Qiu2025, + author = {Qiu, Haiquan and Yao, Quanming}, + date = {2025-10-05}, + title = {Why Low-Precision Transformer Training Fails: An Analysis on Flash Attention}, + doi = {10.48550/ARXIV.2510.04212}, + eprint = {2510.04212}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Qiu2025 - Why Low Precision Transformer Training Fails_ an Analysis on Flash Attention.pdf:PDF:http\://arxiv.org/pdf/2510.04212v2}, + 
keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + priority = {prio3}, + publisher = {arXiv}, + year = {2025}, +} + +@InProceedings{Radford2018, + author = {Alec Radford and Karthik Narasimhan}, + title = {Improving Language Understanding by Generative Pre-Training}, + file = {:Radford2018 - Improving Language Understanding by Generative Pre Training.pdf:PDF}, + year = {2018}, +} + +@Article{Touvron2023, + author = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timothée and Rozière, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume}, + date = {2023-02-27}, + title = {LLaMA: Open and Efficient Foundation Language Models}, + doi = {10.48550/ARXIV.2302.13971}, + eprint = {2302.13971}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:http\://arxiv.org/pdf/2302.13971v1:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2023}, +} + +@Article{Grattafiori2024, + author = {Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and Yang, Amy and Fan, Angela and Goyal, Anirudh and Hartshorn, Anthony and Yang, Aobo and Mitra, Archi and Sravankumar, Archie and Korenev, Artem and Hinsvark, Arthur and Rao, Arun and Zhang, Aston and Rodriguez, Aurelien and Gregerson, Austen and Spataru, Ava}, + date = {2024-07-31}, + title = {The Llama 3 Herd of Models}, + doi = {10.48550/ARXIV.2407.21783}, + eprint = {2407.21783}, + eprintclass = {cs.AI}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Grattafiori2024 - The Llama 3 Herd of 
Models.pdf:PDF:http\://arxiv.org/pdf/2407.21783v3}, + keywords = {Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Jia2021, + author = {Jia, Chao and Yang, Yinfei and Xia, Ye and Chen, Yi-Ting and Parekh, Zarana and Pham, Hieu and Le, Quoc V. and Sung, Yunhsuan and Li, Zhen and Duerig, Tom}, + date = {2021-02-11}, + journaltitle = {International Conference on Machine Learning 2021}, + title = {Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision}, + doi = {10.48550/ARXIV.2102.05918}, + eprint = {2102.05918}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Jia2021 - Scaling up Visual and Vision Language Representation Learning with Noisy Text Supervision.pdf:PDF:http\://arxiv.org/pdf/2102.05918v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2021}, +} + +@Article{Li2022c, + author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven}, + date = {2022-01-28}, + title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation}, + doi = {10.48550/ARXIV.2201.12086}, + eprint = {2201.12086}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Li2022c - BLIP_ Bootstrapping Language Image Pre Training for Unified Vision Language Understanding and Generation.pdf:PDF:http\://arxiv.org/pdf/2201.12086v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2022}, +} + +@Article{Thapa2024, + author = {Thapa, Rahul and Chen, Kezhen and Covert, Ian 
and Chalamala, Rahul and Athiwaratkun, Ben and Song, Shuaiwen Leon and Zou, James}, + date = {2024-06-03}, + title = {Dragonfly: Multi-Resolution Zoom-In Encoding Enhances Vision-Language Models}, + doi = {10.48550/ARXIV.2406.00977}, + eprint = {2406.00977}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Thapa2024 - Dragonfly_ Multi Resolution Zoom in Encoding Enhances Vision Language Models.pdf:PDF:http\://arxiv.org/pdf/2406.00977v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@Article{Beyer2024, + author = {Beyer, Lucas and Steiner, Andreas and Pinto, André Susano and Kolesnikov, Alexander and Wang, Xiao and Salz, Daniel and Neumann, Maxim and Alabdulmohsin, Ibrahim and Tschannen, Michael and Bugliarello, Emanuele and Unterthiner, Thomas and Keysers, Daniel and Koppula, Skanda and Liu, Fangyu and Grycner, Adam and Gritsenko, Alexey and Houlsby, Neil and Kumar, Manoj and Rong, Keran and Eisenschlos, Julian and Kabra, Rishabh and Bauer, Matthias and Bošnjak, Matko and Chen, Xi and Minderer, Matthias and Voigtlaender, Paul and Bica, Ioana and Balazevic, Ivana and Puigcerver, Joan and Papalampidi, Pinelopi and Henaff, Olivier and Xiong, Xi and Soricut, Radu and Harmsen, Jeremiah and Zhai, Xiaohua}, + date = {2024-07-10}, + title = {PaliGemma: A versatile 3B VLM for transfer}, + doi = {10.48550/ARXIV.2407.07726}, + eprint = {2407.07726}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Beyer2024 - PaliGemma_ a Versatile 3B VLM for Transfer.pdf:PDF:http\://arxiv.org/pdf/2407.07726v2}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences}, + 
publisher = {arXiv}, + year = {2024}, +} + +@Article{Yang2025, + author = {Yang, An and Li, Anfeng and Yang, Baosong and Zhang, Beichen and Hui, Binyuan and Zheng, Bo and Yu, Bowen and Gao, Chang and Huang, Chengen and Lv, Chenxu and Zheng, Chujie and Liu, Dayiheng and Zhou, Fan and Huang, Fei and Hu, Feng and Ge, Hao and Wei, Haoran and Lin, Huan and Tang, Jialong and Yang, Jian and Tu, Jianhong and Zhang, Jianwei and Yang, Jianxin and Yang, Jiaxi and Zhou, Jing and Zhou, Jingren and Lin, Junyang and Dang, Kai and Bao, Keqin and Yang, Kexin and Yu, Le and Deng, Lianghao and Li, Mei and Xue, Mingfeng and Li, Mingze and Zhang, Pei and Wang, Peng and Zhu, Qin and Men, Rui and Gao, Ruize and Liu, Shixuan and Luo, Shuang and Li, Tianhao and Tang, Tianyi and Yin, Wenbiao and Ren, Xingzhang and Wang, Xinyu and Zhang, Xinyu and Ren, Xuancheng and Fan, Yang and Su, Yang and Zhang, Yichang and Zhang, Yinger and Wan, Yu and Liu, Yuqiong and Wang, Zekun and Cui, Zeyu and Zhang, Zhenru and Zhou, Zhipeng and Qiu, Zihan}, + date = {2025-05-14}, + title = {Qwen3 Technical Report}, + doi = {10.48550/ARXIV.2505.09388}, + eprint = {2505.09388}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Yang2025 - Qwen3 Technical Report.pdf:PDF}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Zhang2025a, + author = {Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren}, + date = {2025-06-05}, + title = {Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models}, + doi = {10.48550/ARXIV.2506.05176}, + eprint = {2506.05176}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {arXiv.org perpetual, non-exclusive license}, + file = {:Zhang2025a - Qwen3 
Embedding_ Advancing Text Embedding and Reranking through Foundation Models.pdf:PDF:http\://arxiv.org/pdf/2506.05176v3}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2025}, +} + +@Article{Lee2024b, + author = {Lee, Chankyu and Roy, Rajarshi and Xu, Mengyao and Raiman, Jonathan and Shoeybi, Mohammad and Catanzaro, Bryan and Ping, Wei}, + date = {2024-05-27}, + title = {NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding Models}, + doi = {10.48550/ARXIV.2405.17428}, + eprint = {2405.17428}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + copyright = {Creative Commons Attribution 4.0 International}, + file = {:Lee2024b - NV Embed_ Improved Techniques for Training LLMs As Generalist Embedding Models.pdf:PDF:http\://arxiv.org/pdf/2405.17428v3}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Information Retrieval (cs.IR), Machine Learning (cs.LG), FOS: Computer and information sciences}, + publisher = {arXiv}, + year = {2024}, +} + +@InProceedings{Cai2021, + author = {Xingyu Cai and Jiaji Huang and Yuchen Bian and Kenneth Church}, + booktitle = {International Conference on Learning Representations}, + title = {Isotropy in the Contextual Embedding Space: Clusters and Manifolds}, + url = {https://openreview.net/forum?id=xYGNO86OWDH}, + file = {:Cai2021 - Isotropy in the Contextual Embedding Space_ Clusters and Manifolds.pdf:PDF}, + year = {2021}, +} + +@Article{Razzhigaev2023, + author = {Razzhigaev, Anton and Mikhalchuk, Matvey and Goncharova, Elizaveta and Oseledets, Ivan and Dimitrov, Denis and Kuznetsov, Andrey}, + date = {2023-11-10}, + title = {The Shape of Learning: Anisotropy and Intrinsic Dimensions in Transformer-Based Models}, + doi = {10.48550/ARXIV.2311.05928}, + eprint = {2311.05928}, + eprintclass = {cs.CL}, + eprinttype = {arXiv}, + abstract = {In this study, we present an investigation into the anisotropy dynamics and 
intrinsic dimension of embeddings in transformer architectures, focusing on the dichotomy between encoders and decoders. Our findings reveal that the anisotropy profile in transformer decoders exhibits a distinct bell-shaped curve, with the highest anisotropy concentrations in the middle layers. This pattern diverges from the more uniformly distributed anisotropy observed in encoders. In addition, we found that the intrinsic dimension of embeddings increases in the initial phases of training, indicating an expansion into higher-dimensional space. Which is then followed by a compression phase towards the end of training with dimensionality decrease, suggesting a refinement into more compact representations. Our results provide fresh insights to the understanding of encoders and decoders embedding properties.}, + copyright = {Creative Commons Zero v1.0 Universal}, + file = {:Razzhigaev2023 - The Shape of Learning_ Anisotropy and Intrinsic Dimensions in Transformer Based Models.pdf:PDF:http\://arxiv.org/pdf/2311.05928v2}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Information Theory (cs.IT), Machine Learning (cs.LG), General Topology (math.GN), FOS: Computer and information sciences, FOS: Mathematics}, + publisher = {arXiv}, + year = {2023}, +} + +@Comment{jabref-meta: databaseType:biblatex;} + +@Comment{jabref-meta: fileDirectoryLatex-tobias-port-4114:/home/nauen/cloud/JobDFKI;} + +@Comment{jabref-meta: fileDirectoryLatex-tobias-tobias-MS-7C37:/data/cloud/JobDFKI;} + +@Comment{jabref-meta: grouping: +0 AllEntriesGroup:; +1 StaticGroup:Datasets\;0\;0\;0xff00ffff\;\;\;; +1 StaticGroup:Coreset for FL\;0\;1\;0xe5e500ff\;\;\;; +1 StaticGroup:Dataset Distillation Survey\;0\;0\;0x003333ff\;\;Citations for our distillation survey paper\;; +2 StaticGroup:Condensed Dataset\;0\;0\;0x8a8a8aff\;\;\;; +2 StaticGroup:Importance Sampling\;0\;1\;0x8a8a8aff\;\;\;; +2 StaticGroup:Pruning\;0\;1\;0x8a8a8aff\;\;\;; +2 StaticGroup:Noisy 
Labels\;0\;1\;0x8a8a8aff\;\;\;; +2 StaticGroup:Surveys\;0\;1\;0x8a8a8aff\;\;\;; +1 StaticGroup:WTF Benchmark\;0\;1\;0xb31a1aff\;\;\;; +1 StaticGroup:Reading Group Potential\;0\;0\;0x1a3399ff\;\;\;; +1 StaticGroup:ForAug\;0\;1\;0xcc6633ff\;\;\;; +} diff --git a/main.brf b/main.brf index f7e7a01..356bf26 100644 --- a/main.brf +++ b/main.brf @@ -25,10 +25,8 @@ \backcite {Zong2022}{{1}{1}{figure.caption.1}} \backcite {Shorten2019}{{1}{1}{figure.caption.1}} \backcite {Xu2023d}{{1}{1}{figure.caption.1}} -\backcite {Alomar2023}{{2}{1}{figure.caption.1}} \backcite {Ding2023a}{{2}{1}{figure.caption.1}} \backcite {RojasGomez2023}{{2}{1}{figure.caption.1}} -\backcite {Kolesnikov2020}{{2}{1}{figure.caption.1}} \backcite {Ren2024}{{2}{1}{figure.caption.1}} \backcite {Sun2024}{{2}{1}{figure.caption.1}} \backcite {Suvorov2021}{{2}{1}{figure.caption.1}} @@ -50,66 +48,67 @@ \backcite {Hinterstoisser2019}{{2}{2}{section*.4}} \backcite {Dwibedi2017}{{2}{2}{section*.4}} \backcite {Ge2023}{{2}{2}{section*.4}} -\backcite {Werman2021}{{2}{2}{section*.4}} -\backcite {Hendrycks2019}{{3}{2}{section*.5}} -\backcite {Hendrycks2019}{{3}{2}{section*.5}} -\backcite {Li2023e}{{3}{2}{section*.5}} -\backcite {Zhang2024f}{{3}{2}{section*.5}} -\backcite {Geirhos2018}{{3}{2}{section*.5}} -\backcite {Xiao2020}{{3}{2}{section*.5}} -\backcite {Sun2024}{{3}{3}{section*.7}} -\backcite {Ren2024}{{3}{3}{section*.7}} -\backcite {Liu2023e}{{3}{3}{section*.7}} -\backcite {Kirillov2023}{{3}{3}{section*.7}} -\backcite {Suvorov2021}{{3}{3}{section*.7}} -\backcite {Sun2024}{{3}{3}{section*.7}} -\backcite {Bates1955}{{4}{3}{equation.3.2}} -\backcite {Touvron2022}{{4}{3}{equation.3.2}} +\backcite {Kang2022}{{2}{2}{section*.4}} +\backcite {Hendrycks2019}{{2}{2}{section*.6}} +\backcite {Hendrycks2019}{{2}{2}{section*.6}} +\backcite {Li2023e}{{2}{2}{section*.6}} +\backcite {Zhang2024f}{{2}{2}{section*.6}} +\backcite {Geirhos2018}{{3}{2}{section*.6}} +\backcite {Xiao2020}{{3}{2}{section*.6}} +\backcite 
{Sun2024}{{3}{3.1}{subsection.3.1}} +\backcite {Ren2024}{{3}{3.1}{subsection.3.1}} +\backcite {Liu2023e}{{3}{3.1}{subsection.3.1}} +\backcite {Kirillov2023}{{3}{3.1}{subsection.3.1}} +\backcite {Suvorov2021}{{3}{3.1}{subsection.3.1}} +\backcite {Sun2024}{{3}{3.1}{subsection.3.1}} +\backcite {Touvron2022}{{4}{3.2}{subsection.3.2}} +\backcite {Suvorov2021}{{4}{1}{table.caption.7}} +\backcite {Suvorov2021}{{4}{1}{table.caption.7}} +\backcite {Sun2024}{{4}{1}{table.caption.7}} \backcite {Le2015}{{4}{4.1}{subsection.4.1}} -\backcite {Suvorov2021}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Suvorov2021}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Suvorov2021}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 
65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{\caption@xref {??}{ on input line 65}}{table.caption.9}} -\backcite {Sun2024}{{5}{4.1}{table.caption.9}} -\backcite {Suvorov2021}{{5}{4.1}{table.caption.9}} -\backcite {Bates1955}{{5}{4.1}{table.caption.11}} -\backcite {Jonhson1995}{{6}{4.1}{table.caption.11}} -\backcite {Nauen2023}{{6}{4.2}{table.caption.13}} -\backcite {Touvron2022}{{6}{4.2}{table.caption.13}} -\backcite {Dosovitskiy2021}{{6}{4.2}{table.caption.13}} -\backcite {Liu2021}{{6}{4.2}{table.caption.13}} -\backcite {He2016}{{6}{4.2}{table.caption.13}} -\backcite {Maji2013}{{6}{4.2}{table.caption.14}} -\backcite {Dehghan2017}{{6}{4.2}{table.caption.14}} -\backcite {Nilsback2008}{{6}{4.2}{table.caption.14}} -\backcite {Kaur2017}{{6}{4.2}{table.caption.14}} -\backcite {Parkhi2012}{{6}{4.2}{table.caption.14}} -\backcite {Chattopadhay2018}{{7}{4.3}{table.caption.18}} -\backcite {Selvaraju2016}{{7}{4.3}{table.caption.18}} -\backcite {Sundararajan2017}{{7}{4.3}{table.caption.18}} -\backcite {Selvaraju2016}{{7}{4.3}{table.caption.18}} -\backcite {Chattopadhay2018}{{7}{4.3}{table.caption.18}} -\backcite {Sundararajan2017}{{7}{4.3}{table.caption.18}} -\backcite {You2020}{{12}{\caption@xref {??}{ on input line 21}}{table.caption.25}} -\backcite {Touvron2022}{{12}{\caption@xref {??}{ on input line 22}}{table.caption.25}} -\backcite {Nauen2023}{{12}{A}{table.caption.26}} -\backcite {Touvron2022}{{12}{A}{table.caption.26}} -\backcite {Nauen2023}{{12}{A}{table.caption.26}} +\backcite {Sun2024}{{4}{4.1}{table.caption.8}} +\backcite {Suvorov2021}{{4}{4.1}{table.caption.8}} +\backcite {Suvorov2021}{{5}{2}{table.caption.8}} +\backcite {Bates1955}{{5}{4.1}{table.caption.9}} +\backcite {Nauen2025}{{5}{4.2}{table.caption.11}} +\backcite 
{Touvron2022}{{5}{4.2}{table.caption.11}} +\backcite {Dosovitskiy2021}{{5}{4.2}{table.caption.11}} +\backcite {Liu2021}{{5}{4.2}{table.caption.11}} +\backcite {He2016}{{5}{4.2}{table.caption.11}} +\backcite {Touvron2021b}{{5}{4.2}{table.caption.11}} +\backcite {Ge2023}{{6}{4.2}{table.caption.12}} +\backcite {Ghiasi2020}{{6}{4.2}{table.caption.12}} +\backcite {Shermaine2025}{{6}{4.2}{table.caption.12}} +\backcite {Maji2013}{{6}{4.2}{table.caption.13}} +\backcite {Dehghan2017}{{6}{4.2}{table.caption.13}} +\backcite {Nilsback2008}{{6}{4.2}{table.caption.13}} +\backcite {Kaur2017}{{6}{4.2}{table.caption.13}} +\backcite {Parkhi2012}{{6}{4.2}{table.caption.13}} +\backcite {Xiao2020}{{7}{4.3}{figure.caption.14}} +\backcite {Wang2024f}{{7}{4.3}{figure.caption.14}} +\backcite {Chattopadhay2018}{{7}{4.3}{figure.caption.15}} +\backcite {Selvaraju2016}{{7}{4.3}{figure.caption.15}} +\backcite {Sundararajan2017}{{7}{4.3}{figure.caption.15}} +\backcite {Selvaraju2016}{{7}{4.3}{figure.caption.15}} +\backcite {Chattopadhay2018}{{7}{4.3}{figure.caption.15}} +\backcite {Sundararajan2017}{{7}{4.3}{figure.caption.15}} +\backcite {Bates1955}{{12}{A}{figure.caption.20}} +\backcite {Jonhson1995}{{12}{A}{figure.caption.20}} +\backcite {You2020}{{13}{9}{table.caption.23}} +\backcite {Touvron2022}{{13}{9}{table.caption.23}} +\backcite {Touvron2021b}{{13}{9}{table.caption.23}} +\backcite {Yun2019}{{13}{9}{table.caption.23}} +\backcite {Zhong2017}{{13}{9}{table.caption.23}} +\backcite {Cubuk2019}{{13}{9}{table.caption.23}} +\backcite {Zhang2018a}{{13}{9}{table.caption.23}} +\backcite {Yun2019}{{13}{9}{table.caption.23}} +\backcite {Nauen2025}{{13}{C}{table.caption.24}} +\backcite {Touvron2022}{{13}{C}{table.caption.24}} +\backcite {Touvron2021b}{{13}{C}{table.caption.24}} +\backcite {Nauen2025}{{13}{C}{table.caption.24}} +\backcite {Paszke2019}{{13}{C}{table.caption.24}} +\backcite {Wightman2019}{{13}{C}{table.caption.24}} +\backcite {Suvorov2021}{{14}{D}{table.caption.26}} +\backcite 
{Sun2024}{{14}{D}{table.caption.26}} +\backcite {Ren2024}{{15}{E}{figure.caption.28}} +\backcite {Ren2024}{{17}{7}{figure.caption.28}} diff --git a/main.pdf b/main.pdf index 85ac8ba..15db48e 100644 Binary files a/main.pdf and b/main.pdf differ diff --git a/main.tex b/main.tex index 58ba1f8..a3c9704 100644 --- a/main.tex +++ b/main.tex @@ -1,14 +1,11 @@ -% ICCV 2025 Paper Template +% CVPR 2026 Paper Template; see https://github.com/cvpr-org/author-kit \documentclass[10pt,twocolumn,letterpaper]{article} %%%%%%%%% PAPER TYPE - PLEASE UPDATE FOR FINAL VERSION -% \usepackage{iccv} % To produce the CAMERA-READY version -% \usepackage[review]{iccv} % To produce the REVIEW version -\usepackage[pagenumbers]{iccv} % To force page numbers, e.g. for an arXiv version - -% Import additional packages in the preamble file, before hyperref -\input{packages} +% \usepackage{cvpr} % To produce the CAMERA-READY version +% \usepackage[review]{cvpr} % To produce the REVIEW version +\usepackage[pagenumbers]{cvpr} % To force page numbers, e.g. for an arXiv version % It is strongly recommended to use hyperref, especially for the review version. % hyperref with option pagebackref eases the reviewers' job. @@ -17,27 +14,32 @@ % % If you comment hyperref and then uncomment it, you should delete *.aux before re-running LaTeX. % (Or just hit 'q' on the first LaTeX run, let it finish, and you should be clear). 
-\definecolor{iccvblue}{rgb}{0.21,0.49,0.74} -\usepackage[pagebackref,breaklinks,colorlinks,allcolors=iccvblue]{hyperref} -\usepackage[capitalize,noabbrev]{cleveref} +\definecolor{cvprblue}{rgb}{0.21,0.49,0.74} +\usepackage[pagebackref,breaklinks,colorlinks,allcolors=cvprblue]{hyperref} +\input{packages} %%%%%%%%% PAPER ID - PLEASE UPDATE -\def\paperID{6426} % *** Enter the Paper ID here -\def\confName{ICCV} -\def\confYear{2025} - -\newcommand{\name}{\textit{ForNet}\xspace} -\newcommand{\schemename}{\textit{ForAug}\xspace} -% Names: RecombiNet, RecombNet, ReMix, ReMixNet, FoReMix/ForeMix +\def\paperID{4792} % *** Enter the Paper ID here +\def\confName{CVPR} +\def\confYear{2026} %%%%%%%%% TITLE - PLEASE UPDATE -\title{\schemename: Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation} +\newcommand{\name}{\textit{ForNet}\xspace} +\newcommand{\schemename}{\textit{ForAug}\xspace} +\title{\schemename: Mitigating Biases and Improving Vision Transformer Training by Recombining Foregrounds and Backgrounds} +% \title{\schemename: Mitigating Biases and Improving ViT Training by Recombining Foregrounds and Backgrounds} +% \title{\LaTeX\ Author Guidelines for \confName~Proceedings} %%%%%%%%% AUTHORS - PLEASE UPDATE -\author{Tobias Christian Nauen${}^{1,2}$ Brian Moser${}^2$ Federico Raue${}^2$ Stanislav Frolov${}^2$ Andreas Dengel${}^{1,2}$\\ -${}^1$RPTU Kaiserslautern-Landau, Kaiserslautern, Germany \\ -${}^2$German Research Center for Artificial Intelligence (DFKI), Kaiserslautern, Germany \\ -{\tt\small first\_second.last@dfki.de / first.last@dfki.de} +\author{ + Tobias Christian Nauen\textsuperscript{\rm 1,\rm 2}, + Brian Moser\textsuperscript{\rm 2}, + Federico Raue\textsuperscript{\rm 2}, + Stanislav Frolov\textsuperscript{\rm 2}, + Andreas Dengel\textsuperscript{\rm 1,\rm 2} \\ + \textsuperscript{\rm 1}RPTU University Kaiserslautern-Landau, Kaiserslautern, Germany \\ + \textsuperscript{\rm 2}German Research Center for 
Artificial Intelligence (DFKI), Kaiserslautern, Germany \\ + {\tt\small first\_second.last@dfki.de / first.last@dfki.de} % For a paper whose authors are all at the same institution, % omit the following lines up until the closing ``}''. % Additional authors and addresses can be added with ``\and'', @@ -55,16 +57,15 @@ ${}^2$German Research Center for Artificial Intelligence (DFKI), Kaiserslautern, % \input{sec/future_work} \input{sec/conclusion} \input{sec/acks} - { \small \bibliographystyle{ieeenat_fullname} - \bibliography{../../JabRef/main_bib} + \bibliography{main} } -\newpage -\onecolumn +% WARNING: do not forget to delete the supplementary pages from your submission \appendix +\onecolumn \input{sec/appendix} \end{document} diff --git a/packages.tex b/packages.tex index 44dc74f..858f99a 100644 --- a/packages.tex +++ b/packages.tex @@ -1,14 +1,11 @@ -% \usepackage{color} +\usepackage{color} % \usepackage{hyperref} -% if you use cleveref.. -% \usepackage[capitalize,noabbrev]{cleveref} - - % my own set of packages \usepackage{amssymb} \usepackage{amsfonts} \usepackage{amsmath} +\usepackage[capitalize,noabbrev]{cleveref} %\usepackage{tabu} \usepackage{amsxtra} \usepackage{cancel} @@ -29,7 +26,8 @@ \usepackage{textcomp} %\usepackage[defaultlines=3,all]{nowidow} \usepackage{float} -%\usepackage{xcolor} +\usepackage{placeins} +\usepackage{xcolor} \usepackage{pdflscape} \usepackage{csquotes} %\usepackage{setspace} @@ -54,6 +52,7 @@ \usepackage{siunitx} \usepackage{booktabs} \usepackage{microtype} +\usepackage{footmisc} % Mathshortcuts \DeclareMathSymbol{\mlq}{\mathord}{operators}{``} @@ -133,8 +132,12 @@ \newcommand{\ops}{\operatorname{ops}} \newcommand{\entr}{\operatorname{entries}} \newcommand{\gtxt}[1]{\text{\textcolor{gray}{#1}}} +\definecolor{DarkGreen}{RGB}{34,149,34} \newcommand{\grntxt}[1]{\text{\textcolor{ForestGreen}{#1}}} +\newcommand{\rdtxt}[1]{\text{\textcolor{red}{#1}}} \newcommand{\code}[1]{\texttt{#1}} +\newcommand{\cmark}{\ding{51}}% 
+\newcommand{\xmark}{\ding{55}}% \newcommand*\rot{\rotatebox{90}} \newcommand{\tldr}{\textbf{TL;DR:}\xspace} diff --git a/preamble.tex b/preamble.tex new file mode 100644 index 0000000..157b4fd --- /dev/null +++ b/preamble.tex @@ -0,0 +1,51 @@ +%% This file contains a number of tweaks that are typically applied to the main document. +%% They are not enabled by default, but can be enabled by uncommenting the relevant lines. + +%% +%% Inline annotations; for predefined colors, refer to "dvipsnames" in the xcolor package: +%% https://tinyurl.com/overleaf-colors +%% +\newcommand{\red}[1]{{\color{red}#1}} +\newcommand{\todo}[1]{{\color{red}#1}} +\newcommand{\TODO}[1]{\textbf{\color{red}[TODO: #1]}} +%% +%% disable for camera ready / submission by uncommenting these lines +%% +% \renewcommand{\TODO}[1]{} +% \renewcommand{\todo}[1]{#1} + +%% +%% work harder in optimizing text layout. Typically shrinks text by 1/6 of page, enable +%% it at the very end of the writing process, when you are just above the page limit +%% +% \usepackage{microtype} + +%% +%% fine-tune paragraph spacing +%% +% \renewcommand{\paragraph}[1]{\vspace{.5em}\noindent\textbf{#1.}} + +%% +%% globally adjusts space between figure and caption +%% +% \setlength{\abovecaptionskip}{.5em} + + +%% +%% Allows "the use of \paper to refer to the project name" +%% with automatic management of space at the end of the word +%% +% \usepackage{xspace} +% \newcommand{\paper}{ProjectName\xspace} + +%% +%% Commonly used math definitions +%% +% \DeclareMathOperator*{\argmin}{arg\,min} +% \DeclareMathOperator*{\argmax}{arg\,max} + +%% +%% Tigthen underline +%% +% \usepackage{soul} +% \setuldepth{foobar} \ No newline at end of file diff --git a/sec/abstract.tex b/sec/abstract.tex index 759080a..89166c7 100644 --- a/sec/abstract.tex +++ b/sec/abstract.tex @@ -2,14 +2,18 @@ \begin{abstract} Transformers, particularly Vision Transformers (ViTs), have achieved state-of-the-art performance in large-scale image classification. 
- However, they often require large amounts of data and can exhibit biases that limit their robustness and generalizability. - This paper introduces \schemename, a novel data augmentation scheme that addresses these challenges and explicitly includes inductive biases, which commonly are part of the neural network architecture, into the training data. + However, they often require large amounts of data and can exhibit biases, such as center or size bias, that limit their robustness and generalizability. + This paper introduces \schemename, a novel data augmentation operation that addresses these challenges by explicitly imposing invariances into the training data, which are otherwise part of the neural network architecture. % This paper introduces \name, a novel dataset derived from ImageNet that addresses these challenges. - \schemename is constructed by using pretrained foundation models to separate and recombine foreground objects with different backgrounds, enabling fine-grained control over image composition during training. - It thus increases the data diversity and effective number of training samples. - We demonstrate that training on \name, the application of \schemename to ImageNet, significantly improves the accuracy of ViTs and other architectures by up to 4.5 percentage points (p.p.) on ImageNet and 7.3 p.p. on downstream tasks. - Importantly, \schemename enables novel ways of analyzing model behavior and quantifying biases. - Namely, we introduce metrics for background robustness, foreground focus, center bias, and size bias and show that training on \name substantially reduces these biases compared to training on ImageNet. + \schemename is constructed by using pretrained foundation models to separate and recombine foreground objects with different backgrounds. + % enabling fine-grained control over image composition during training. 
+ % Missing sentence here of how you use it to generate data in what way and with what purpose wrt to bias + This recombination step enables us to take fine-grained control over object position and size, as well as background selection. + % It thus increases the data diversity and effective number of training samples. + We demonstrate that using \schemename significantly improves the accuracy of ViTs and other architectures by up to 4.5 percentage points (p.p.) on ImageNet, which translates to 7.3 p.p. on downstream tasks. + % Importantly, \schemename enables novel ways of analyzing model behavior and quantifying biases. + Importantly, \schemename not only improves accuracy but also opens new ways to analyze model behavior and quantify biases. + Namely, we introduce metrics for background robustness, foreground focus, center bias, and size bias and show that using \schemename during training substantially reduces these biases. In summary, \schemename provides a valuable tool for analyzing and mitigating biases, enabling the development of more robust and reliable computer vision models. - Our code and dataset are publicly available at \url{https://github.com/tobna/ForAug}. + Our code and dataset are publicly available at \code{https://github.com/tobna/ForAug}. \end{abstract} \ No newline at end of file diff --git a/sec/acks.tex b/sec/acks.tex index e0cee0f..372778e 100644 --- a/sec/acks.tex +++ b/sec/acks.tex @@ -2,5 +2,7 @@ \subsection*{Acknowledgements} \label{sec:acknowledgements} -This work was funded by the Carl-Zeiss Foundation under the Sustainable Embedded AI project (P2021-02-009) and by the EU project SustainML (Horizon Europe grant agreement No 101070408). -All compute was done thanks to the Pegasus cluster at DFKI. +% Will be in the final paper. + +This work was funded by the Carl-Zeiss Foundation under the Sustainable Embedded AI project (P2021-02-009). 
by the EU project SustainML (Horizon Europe grant agreement No 101070408) and by the BMFTR project Albatross (funding code 16IW24002). +All compute was done thanks to the Pegasus cluster at DFKI Kaiserslautern. diff --git a/sec/appendix.tex b/sec/appendix.tex index 074315e..78e45d6 100644 --- a/sec/appendix.tex +++ b/sec/appendix.tex @@ -1,57 +1,154 @@ -% !TeX root = ../supplementary.tex +% !TeX root = ../main.tex + +\section{Extended Bates Distribution} +\begin{figure}[h!] + \centering + \includegraphics[width=.5\columnwidth]{img/bates.pdf} + \caption{Plot of the probability distribution function (PDF) of the extended Bates distribution for different parameters $\eta$. Higher values of $\eta$ concentrate the distribution around the center.} + \label{fig:bates-pdf} +\end{figure} + +% Finally, we analyze the foreground object's positioning in the image. +% We utilize an extended Bates distribution to sample the position of the foreground object. +% The Bates distribution~\cite{Bates1955} with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}. +% Therefore, the larger $\eta$, the more concentrated the distribution is around the center. +% We extend this concept to $\eta \leq -1$ by shifting the distribution away from the center and towards the edges. +% We extend this concept to $\eta \leq -1$ by defining +% \begin{align*} +% X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta) +% \end{align*} +% for $\eta \leq 1$ with $s$ being the sawtooth function on $[0, 1]$: +% \begin{align} +% s(x) = \begin{cases} +% x + 0.5 & \text{if } 0 < x < 0.5 \\ +% x - 0.5 & \text{if } 0.5 \leq x \leq 1 +% \end{cases} +% \end{align} +% Note that $s \circ s = \id$ on $[0, 1]$. +% This way, distributions with $\eta \leq -1$ are more concentrated around the borders. +% $\eta = 1$ and $\eta = -1$ both correspond to the uniform distribution. 
+% The PDF of this extended Bates distribution is visualized in \Cref{fig:bates-pdf}. + +We introduce an extension of the Bates distribution~\cite{Bates1955} to include negative parameters, enabling sampling of foreground object positions away from the image center. +The standard Bates distribution, for $\eta \in \N$, is defined as the mean of $\eta$ independent random variables drawn from a uniform distribution \cite{Jonhson1995}. +A larger $\eta$ value increases the concentration of samples around the distribution's mean, which in this case is the image center. + +To achieve an opposite effect--concentrating samples at the image borders--we extend the distribution to $\eta \leq 1$. +\begin{align*} + X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta) +\end{align*} +This is accomplished by sampling from a standard Bates distribution with parameter $-\eta \geq 1$ and then applying a sawtooth function. +The sawtooth function on the interval $[0,1]$ is defined as +\begin{align} + s(x) = \begin{cases} + x + 0.5 & \text{if } 0 < x < 0.5 \\ + x - 0.5 & \text{if } 0.5 \leq x \leq 1 + \end{cases} +\end{align} +This function effectively maps the central portion of the interval to the edges and the edge portions to the center. +For example, a value of 0.3 (central-left) is mapped to 0.8 (edge-right), while 0.8 (edge-right) is mapped to 0.3 (central-left). +This transformation inverts the distribution's concentration, shifting the probability mass from the center to the borders. +We visualize the distribution function of the extended Bates distribution in \Cref{fig:bates-pdf}. +Both $\eta = 1$ and $\eta = -1$ result in a uniform distribution across the image. + +\section{Resource Usage of \schemename} +To utilize the proposed \schemename, specific computational resources are necessary, particularly for computing and storing for the output of the segmentation stage and for on-the-fly processing of the recombination stage. 
+ +\paragraph{Segmentation.} +% While calculating the segmentations and infills takes a lot of compute, this is effort that has to be spent only once per dataset. +\schemename involves a computationally expensive segmentation and infill stage, which is a one-time calculation per dataset. +Once computed, the segmentation and infill results can be perpetually reused, amortizing the initial cost over all subsequent experiments and applications. +On NVIDIA H100 GPUs, the segmentation stage will compute at a rate of $374.3 \frac{\text{img}}{\text{GPU} \times \text{h}}$ when using Attentive Eraser or $5 338.6 \frac{\text{img}}{\text{GPU} \times \text{h}}$ for LaMa. +For ImageNet this comes down to just under 9 days (Attentive Eraser) or 16 hours (LaMa) on two 8 GPU nodes. +To facilitate immediate use and reproduction of results, we publicly provide the precalculated segmentation stage output for the ImageNet dataset for download\footnote{Link will go here.}. +The output of \schemename's segmentation step on ImageNet dataset requires 73 GB of additional disk space for the segmentation output, which is separate from the base 147 GB ImageNet size. + +\paragraph{Recombination.} +The recombination step of \schemename is implemented as a based data loader operation. +It's thus offloaded to the CPU, where it can be heavily parallelized and thus only results in a very minor increase in the training step-time. +For example, using a ViT-B model on an NVIDIA A100 GPU, the average update step-time increased by $1\%$, from $528 \pm 2$ ms to $534 \pm 1$ ms. + \section{Training Setup} \label{sec:training_setup} -\begin{table}[h] +\begin{table*}[h!] 
\centering - \begin{tabular}{lc} - \toprule - Parameter & Value \\ - \midrule - Image Resolution & $224 \times 224$ \\ - Epochs & 300 \\ - Learning Rate & 3e-3 \\ - Learning Rate Schedule & cosine decay \\ - Batch Size & 2048 \\ - Warmup Schedule & linear \\ - Warmup Epochs & 3 \\ - Weight Decay & 0.02 \\ - Label Smoothing & 0.1 \\ - Optimizer & Lamb \cite{You2020} \\ - Data Augmentation Policy & 3-Augment \cite{Touvron2022} \\ - \bottomrule - \end{tabular} - \caption{Training setup for our ImageNet and \name training.} + \caption{Training setup and hyperparameters for our ImageNet training.} \label{tab:in-setup} -\end{table} -\begin{table}[h] - \centering - \begin{tabular}{lccc} + \begin{tabular}{lcc} \toprule - Dataset & Batch Size & Epochs & Learning Rate \\ + Parameter & ViT, Swin, ResNet & DeiT \\ \midrule - Aircraft & 512 & 500 & 3e-4 \\ - Cars & 1024 & 500 & 3e-4 \\ - Flowers & 256 & 500 & 3e-4 \\ - Food & 2048 & 100 & 3e-4 \\ - Pets & 512 & 500 & 3e-4 \\ + Image Resolution & $224 \times 224$ & $224 \times 224$ \\ + Epochs & 300 & 300 \\ + Learning Rate & 3e-3 & S/B: 1e-3, L: 5e-4 \\ + Learning Rate Schedule & cosine decay & cosine decay \\ + Batch Size & 2048 & 1024 \\ + GPUs & $4\times$ NVIDIA A100/H100/H200 & $4\times$ NVIDIA A100/H100/H200 \\ + Warmup Schedule & linear & linear \\ + Warmup Epochs & 3 & 3 \\ + Weight Decay & 0.02 & 0.05 \\ + Label Smoothing & 0.1 & 0.1 \\ + Optimizer & Lamb \cite{You2020} & AdamW \\ + \cmidrule(r){1-1} + Data Augmentation Policy & \textbf{3-Augment \cite{Touvron2022}} & \textbf{DeiT \cite{Touvron2021b}} \\ + Augmentations & \makecell{Resize \\ RandomCrop \\ HorizontalFlip \\ Grayscale \\ Solarize \\ GaussianBlur \\ ColorJitter \\ CutMix \cite{Yun2019}} & \makecell{RandomResizedCrop \\ HorizontalFlip \\ RandomErase \cite{Zhong2017} \\ RandAugment \cite{Cubuk2019} \\ ColorJitter \\ Mixup \cite{Zhang2018a} \\ CutMix \cite{Yun2019}} \\ \bottomrule \end{tabular} - \caption{Training setup for finetuning on different downstream 
datasets. Other settings are the same as in \Cref{tab:in-setup}.} - \label{tab:downstream-setup} -\end{table} +\end{table*} -On ImageNet we use the same training setup as \cite{Nauen2023} and \cite{Touvron2022} without pretraining. -As our focus is on evaluating the changes in accuracy due to \schemename/\name, like \cite{Nauen2023}, we stick to one set of hyperparameters for all models. -We list the settings used for training on ImageNet and \name in \Cref{tab:in-setup} and the ones used for finetuning those weights on the downstream datasets in \Cref{tab:downstream-setup}. - -\newpage -\section{Infill Model Comparison} -\label{sec:infill-model-comparison} \begin{table}[h!] \centering - \resizebox{\textwidth}{!}{\begin{tabular}{cc@{\hskip 0.3in}cc} + \caption{Training setup for finetuning on different downstream datasets. Other settings are the same as in \Cref{tab:in-setup}. For finetuning, we always utilize 3-Augment and the related parameters from the \emph{ViT, Swin, ResNet} column of \Cref{tab:in-setup}} + \label{tab:downstream-setup} + \begin{tabular}{lcccc} + \toprule + Dataset & Batch Size & Epochs & Learning Rate & Num. GPUs \\ + \midrule + Aircraft & 512 & 500 & 3e-4 & 2 \\ + Cars & 1024 & 500 & 3e-4 & 4 \\ + Flowers & 256 & 500 & 3e-4 & 1 \\ + Food & 2048 & 100 & 3e-4 & 4 \\ + Pets & 512 & 500 & 3e-4 & 2 \\ + \bottomrule + \end{tabular} +\end{table} +On ImageNet we use the same training setup as \cite{Nauen2025} and \cite{Touvron2022} without pretraining for ViT, Swin, and ResNet. +For DeiT, we train the same ViT architecture but using the data augmentation scheme and hyperparameters from \cite{Touvron2021b}. +As our focus is on evaluating the changes in accuracy due to \schemename, like \cite{Nauen2025}, we stick to one set of hyperparameters for all models. +We list the settings used for training on ImageNet in \Cref{tab:in-setup} and the ones used for finetuning those weights on the downstream datasets in \Cref{tab:downstream-setup}. 
+Out implementation is using PyTorch \cite{Paszke2019} and the \emph{timm} library \cite{Wightman2019} for model architectures and basic functions. + +\begin{table*}[h!] + \centering + \caption{Hardware and Software specifics used for both training and evaluation.} + \label{tab:hw-sw-versions} + \begin{tabular}{ll} + \toprule + Parameter & Value \\ + \midrule + GPU & NVIDIA A100/H100/H200 \\ + CPU & 24 CPU cores (Intex Xenon) per GPU \\ + Memory & up to 120GB per GPU \\ + Operating System & Enroot container for SLURM based on Ubuntu 24.04 LTS \\ + Python & 3.12.3 \\ + PyTorch & 2.7.0 \\ + TorchVision & 0.22.0 \\ + Timm & 1.0.15 \\ + \bottomrule + \end{tabular} +\end{table*} +\Cref{tab:hw-sw-versions} lists the specific hardware we use, as well as versions of the relevant software packages. + + +\section{Infill Model Comparison} +\begin{table*}[h!] + \centering + \caption{Example infills of LaMa and Attentive Eraser.} + \label{tab:infill-examples} + \resizebox{.9\textwidth}{!}{ + \begin{tabular}{cc@{\hskip 0.3in}cc} \toprule LaMa & Att. Eraser & LaMa & Att. 
Eraser \\ \midrule @@ -64,26 +161,47 @@ We list the settings used for training on ImageNet and \name in \Cref{tab:in-set \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00003097.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00003097.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00011629.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00011629.JPEG} \\ \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00000547.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00000547.JPEG} & \includegraphics[width=.23\columnwidth]{img/lama_infills/comp/ILSVRC2012_val_00025256.JPEG} & \includegraphics[width=.23\columnwidth]{img/att_err_infills/comp/ILSVRC2012_val_00025256.JPEG} \\ \bottomrule - \end{tabular}} - \caption{Example infills of LaMa and Attentive Eraser.} -\end{table} + \end{tabular} + } +\end{table*} +We visualize example infilled images for both LaMa \cite{Suvorov2021} and Attentive Eraser \cite{Sun2024} in \Cref{tab:infill-examples}. +We qualitatively find that while LaMa often leaves repeated textures of blurry spots where the object was erased, Attentive Eraser produces slightly cleaner and more coherent infills of the background. -\section{Images with High Infill Ratio} -\label{sec:high-infill-ratio} -\begin{table}[h!] +\newpage +\section{Image Infill Ratio} +\begin{table*}[h!] \centering - \begin{tabular}{ccc} - \toprule - Infill Ratio & LaMa & Att. 
Eraser \\ - \midrule - 93.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} \\ \\ - 95.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} \\ \\ - 83.7 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} \\ \\ - 88.2 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG}} - \end{tabular} \caption{Example infills with a large relative foreground area size that is infilled (infill ratio).} \label{tbl:high-rat} -\end{table} + \resizebox{.8\textwidth}{!}{ + \begin{tabular}{ccc} + \toprule + Infill Ratio & LaMa & Att. 
+ Eraser \\ + \midrule + 93.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00003735.JPEG}} \\ \\ + 95.7 & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} & \raisebox{-60pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00012151.JPEG}} \\ \\ + 83.7 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00022522.JPEG}} \\ \\ + 88.2 & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/lama_infills/high_rat/ILSVRC2012_val_00026530.JPEG}} & \raisebox{-50pt}{\includegraphics[width=.3\columnwidth]{img/att_err_infills/high_rat/ILSVRC2012_val_00026530.JPEG}} + \end{tabular}} +\end{table*} + +\begin{figure} + \centering + \includegraphics[width=.9\textwidth]{img/infill_distr.pdf} + \caption{We plot the distribution of the relative size of the detected foreground object that is infilled in our Segmentation step of ImageNet. + While most images contain objects of smaller size, there is a peak where Grounded~SAM~\cite{Ren2024} detects almost the whole image as the foreground object. For examples of such large infills, see \Cref{tbl:high-rat}. + } + \label{fig:infill-distr} +\end{figure} + +\Cref{tbl:high-rat} shows infills for images where Grounded SAM \cite{Ren2024} marks a high percentage of the image as the foreground object (Infill Ratio), which has to be erased by the infill models. +While LaMa tends to fill those spots with mostly black or gray and textures similar to what we saw in \Cref{tab:infill-examples}, Attentive Eraser tends to create novel patterns by copying what is left of the background all over the rest of the image. 
+% We filter out such mostly infilled background using our background pruning hyperparameter $t_\text{prune} = 0.8$. +\Cref{fig:infill-distr} plots the distribution of infill ratios in \schemename. +While there is a smooth curve of the number of detections decreasing with the infill ratio until $\approx 90\%$, there is an additional peak at $\approx 100\%$ infill ratio. +We believe that this peak is made up of failure cases of Grounded~SAM. + +We filter out all backgrounds that have an infill ratio larger than our pruning threshold $t_\text{prune} = 0.8$, which translates to $10\%$ of backgrounds. diff --git a/sec/conclusion.tex b/sec/conclusion.tex index 27e501c..88f1ccb 100644 --- a/sec/conclusion.tex +++ b/sec/conclusion.tex @@ -4,8 +4,8 @@ \label{sec:conclusion} We introduce \schemename, a novel data augmentation scheme that facilitates improved Transformer training for image classification. -By explicitly separating and recombining foreground objects and backgrounds, \schemename enables controlled data augmentation, leading to significant performance gains on ImageNet and downstream fine-grained classification tasks. +By explicitly separating and recombining foreground objects and backgrounds, \schemename enables controlled data augmentation beyond existing image compositions, leading to significant performance gains on ImageNet and downstream fine-grained classification tasks. Furthermore, \schemename provides a powerful framework for analyzing model behavior and quantifying biases, including background robustness, foreground focus, center bias, and size bias. -Our experiments demonstrate that training on \name, the instantiation of \schemename on ImageNet, not only boosts accuracy but also significantly reduces these biases, resulting in more robust and generalizable models. +Our experiments demonstrate that training using \schemename not only boosts accuracy but also significantly reduces these biases, resulting in more robust and generalizable models. 
In the future, we see \schemename be also applied to other datasets and tasks, like video recognition or segmentation. \schemename's ability to both improve performance and provide insights into model behavior makes it a valuable tool for advancing CV research and developing more reliable AI systems. \ No newline at end of file diff --git a/sec/experiments.tex b/sec/experiments.tex index 638fa46..d83b68d 100644 --- a/sec/experiments.tex +++ b/sec/experiments.tex @@ -17,62 +17,96 @@ % \item Size bias % \end{itemize} -We conduct a comprehensive suit of experiments to validate the effectiveness of our approach. -We compare training on \name, the ImageNet instantiation of \schemename, to training on ImageNet for 7 different models. -Furthermore, we assess the impact of using \name for pretraining on multiple fine-grained downstream datasets. -Additionally, we use \schemename's control over the image distribution to quantify some model behaviors and biases. +We conduct a comprehensive suite of experiments to validate the effectiveness of our approach, +% We compare training on \name, the ImageNet instantiation of \schemename, to training on ImageNet for 10 different models. +comparing ImageNet-training with and without \schemename for 10 different models. +Furthermore, we assess the impact of using \schemename for pretraining on multiple fine-grained downstream datasets. +Finally, we exploit \schemename's control over the image distribution to quantify model behaviors and biases. +We always report the mean and standard deviation of three independent training runs. -\subsection{Design Choices of \schemename} +\subsection{Design Choices of \schemename} \label{sec:ablation} -We start by ablating the design choices of \schemename. -For this, we revert to TinyImageNet \cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each, and Tiny\name, a version of \schemename derived from TinyImageNet. -\Cref{tab:ablation} presents the results of these ablations. 
+We start by ablating the design choices of \schemename on TinyImageNet~\cite{Le2015}, a subset of ImageNet containing 200 categories with 500 images each. %, and Tiny\name, the application of \schemename to TinyImageNet. +% \Cref{tab:ablation} presents the results of these ablations. +\Cref{tab:ablation-segment} presents ablations for segmentation and \Cref{tab:ablation-recombine} for recombination. -\begin{table*}[t] +\begin{table} + \caption{Ablation of the design decisions in the segmentation phase of \schemename on TinyImageNet. + The first line is our baseline, while the other lines are using \schemename. + We use basic settings with the \emph{same} background strategy during recombination for this experiment. + } + \label{tab:ablation-segment} \centering - \resizebox{\textwidth}{!}{ - \begin{tabular}{lccccccccccccc} + \small + \resizebox{.9\columnwidth}{!}{ + \begin{tabular}{cccc} \toprule - \multirow{2}{*}{Dataset} & Detect. & Infill & FG. & Augmentation & BG. & BG. & edge & original & \multicolumn{2}{c}{TinyImageNet Accuracy} \\ - & prompt & Model & size & Order & strategy & pruning & smoothing & image mixing & ViT-Ti [\%] & ViT-S [\%] \\ - \cmidrule(r){1-1} \cmidrule(lr){2-9} \cmidrule(l){10-11} - TinyImageNet & & & & & & & & & $66.1\pm0.5$ & $68.3\pm0.7$ \\ - Tiny\name & specific & LaMa \cite{Suvorov2021} & mean & crop$\to$paste$\to$color & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\ - \gtxt{Tiny\name} & \gtxt{specific} & \gtxt{LaMa \cite{Suvorov2021}} & range & \gtxt{crop$\to$paste$\to$color} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\ - \gtxt{Tiny\name} & general & \gtxt{LaMa \cite{Suvorov2021}} & \gtxt{range} & \gtxt{crop$\to$paste$\to$color} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $66.4\pm0.6$ & $72.9\pm0.6$ \\ - \gtxt{Tiny\name} & \gtxt{general} & Att. 
Eraser \cite{Sun2024} & \gtxt{range} & \gtxt{crop$\to$paste$\to$color} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & paste$\to$crop$\to$color & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 2.0$ & \gtxt{-} & $67.2\pm0.4$ & $72.9\pm0.5$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{-} & $65.9\pm0.5$ & $72.4\pm0.6$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & $p=0.2$ & $69.8\pm0.5$ & $75.0\pm0.3$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & $p=0.33$ & $69.5\pm0.4$ & $75.2\pm1.0$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. 
Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & $p=0.5$ & $70.3\pm1.0$ & $74.2\pm0.2$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & linear & $70.1\pm0.7$ & $74.9\pm0.8$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & reverse lin. & $67.6\pm0.2$ & $73.2\pm0.3$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & cos & $71.3\pm1.0$ & $75.7\pm0.8$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{cos} & $70.0\pm0.8$ & $75.5\pm0.7$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & orig. & \gtxt{0.8} & \gtxt{$\sigma_\text{max} = 4.0$} & \gtxt{cos} & $67.2\pm0.9$ & $69.9\pm1.0$ \\ - \gtxt{Tiny\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & all & \gtxt{0.8} & \gtxt{$\sigma_\text{max} = 4.0$} & \gtxt{cos} & $70.1\pm0.7$ & $77.5\pm0.6$ \\ + \multirow{2.5}{*}{\makecell{Detect. \\Prompt}} & \multirow{2.5}{*}{\makecell{Infill \\ Model}} & \multicolumn{2}{c}{TinyImageNet Accuracy [\%]} \\ + \cmidrule{3-4} + & & ViT-Ti & ViT-S \\ \midrule - \name & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & \gtxt{-} & \gtxt{cos} & - & $80.5\pm0.1$ \\ - \gtxt{\name} & \gtxt{general} & \gtxt{Att. 
Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{cos} & - & $80.7\pm0.1$ \\ - \gtxt{\name} & \gtxt{general} & \gtxt{Att. Eraser \cite{Sun2024}} & \gtxt{range} & \gtxt{paste$\to$crop$\to$color} & all & \gtxt{0.8} & \gtxt{$\sigma_\text{max} = 4.0$} & \gtxt{cos} & - & $81.3\pm0.1$ \\ + \multicolumn{2}{l}{\textbf{TinyImageNet}} & $66.1 \pm 0.5$ & $68.3 \pm 0.7$ \\ + specific & LaMa \cite{Suvorov2021} & $65.5 \pm 0.4$ & $71.2 \pm 0.5$ \\ + general & \gtxt{LaMa \cite{Suvorov2021}} & $66.4 \pm 0.6$ & $72.9 \pm 0.6$ \\ + \gtxt{general} & Att. Eraser \cite{Sun2024} & $67.5 \pm 1.2$ & $72.4 \pm 0.5$ \\ \bottomrule \end{tabular}} - \caption{Ablation of design decisions of Tiny\name on TinyImageNet and \name on ImageNet.} - \label{tab:ablation} -\end{table*} +\end{table} + +\begin{table}[t] + \caption{Ablation of the recombination phase of \schemename on TinyImageNet (top) and ImageNet (bottom). The first experiments use the initial segmentation settings with LaMa \cite{Suvorov2021}.} + \label{tab:ablation-recombine} + \centering + \resizebox{\columnwidth}{!}{ + \begin{tabular}{ccccccccccc} + \toprule + % FG. & Augment. & BG. & BG. & Edge & Original & \multicolumn{2}{c}{Accuracy [\%]} \\ + % Size & Order & Strat. & Prune & Smoothing & Mixing & ViT-Ti & ViT-S \\ + \multirow{2.5}{*}{\makecell{FG. 
\\size}} & \multirow{2.5}{*}{\makecell{Augment.\\Order}} & \multirow{2.5}{*}{\makecell{BG\\Strat.}} & \multirow{2.5}{*}{\makecell{BG.\\Prune}} & \multirow{2.5}{*}{\makecell{Original\\Mixing}} & \multirow{2.5}{*}{\makecell{Edge\\Smooth.}} & \multicolumn{2}{c}{Accuracy [\%]} \\ + \cmidrule{7-8} + & & & & & & ViT-Ti & ViT-S \\ + \midrule + % TinyImageNet & & & & & & & $66.1\pm0.5$ & $68.3\pm0.7$ \\ + \multicolumn{6}{l}{\textbf{TinyImageNet}} & \gtxt{$66.1\pm0.5$} & \gtxt{$68.3\pm0.7$} \\ + mean & crop$\to$paste & same & - & - & \gtxt{-} & $64.6\pm0.5$ & $70.0\pm0.6$ \\ + range & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $65.5\pm0.4$ & $71.2\pm0.5$ \\ + \midrule + % \gtxt{range} & \gtxt{crop$\to$paste} & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $66.4\pm0.6$ & $72.9\pm0.6$ \\ + {range} & {crop$\to$paste} & {same} & {-} & {-} & {-} & $67.5\pm1.2$ & $72.4\pm0.5$ \\ + \gtxt{range} & paste$\to$crop & \gtxt{same} & \gtxt{-} & \gtxt{-} & \gtxt{-} & $67.1\pm1.2$ & $72.9\pm0.5$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 1.0 & \gtxt{-} & \gtxt{-} & $67.0\pm1.2$ & $73.0\pm0.3$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.8 & \gtxt{-} & \gtxt{-} & $67.2\pm1.2$ & $72.9\pm0.8$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & 0.6 & \gtxt{-} & \gtxt{-} & $67.5\pm1.0$ & $72.8\pm0.7$ \\ + % \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 2.0$ & \gtxt{-} & $67.2\pm0.4$ & $72.9\pm0.5$ \\ + % \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $\sigma_\text{max} = 4.0$ & \gtxt{-} & $65.9\pm0.5$ & $72.4\pm0.6$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.2$ & \gtxt{-} & $69.8\pm0.5$ & $75.0\pm0.3$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.33$ & \gtxt{-} & $69.5\pm0.4$ & $75.2\pm1.0$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & $p=0.5$ & \gtxt{-} & $70.3\pm1.0$ & $74.2\pm0.2$ 
\\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & linear & \gtxt{-} & $70.1\pm0.7$ & $74.9\pm0.8$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & reverse lin. & \gtxt{-} & $67.6\pm0.2$ & $73.2\pm0.3$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & cos & \gtxt{-} & $71.3\pm1.0$ & $75.7\pm0.8$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & $70.0\pm0.8$ & $75.5\pm0.7$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & orig. & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $67.2\pm0.9$ & $69.9\pm1.0$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & $70.1\pm0.7$ & $77.5\pm0.6$ \\ + \midrule + \multicolumn{6}{l}{\textbf{ImageNet}} & \gtxt{-} & \gtxt{$79.1\pm0.1$} \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & \gtxt{-} & - & $80.5\pm0.1$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & \gtxt{same} & \gtxt{0.8} & \gtxt{cos} & $\sigma_\text{max} = 4.0$ & - & $80.7\pm0.1$ \\ + \gtxt{range} & \gtxt{paste$\to$crop} & all & \gtxt{0.8} & \gtxt{cos} & \gtxt{$\sigma_\text{max} = 4.0$} & - & $81.4\pm0.1$ \\ + \bottomrule + \end{tabular}} +\end{table} + \textbf{Prompt.} % We present the ablation of our main design decisions in \Cref{tab:ablation}. First, we evaluate the type of prompt used to detect the foreground object. Here, the \emph{general} prompt, which contains the class and the more general object category, outperforms only having the class name (\emph{specific}). -\textbf{Inpainting.} Attentive Eraser \cite{Sun2024} produces superior results compared to LaMa \cite{Suvorov2021} (see \Cref{sec:infill-model-comparison} for examples). +\textbf{Inpainting.} Among inpainting models, Attentive Eraser~\cite{Sun2024} produces slightly better results compared to LaMa~\cite{Suvorov2021} ($+0.5$ p.p. on average). 
+For inpainting examples, see the supplementary material. +% (see the supplementary material for examples). % When comparing the infill models, the GAN-based LaMa \cite{Suvorov2021} gets outperformed by the Attentive Eraser \cite{Sun2024}. \textbf{Foreground size} @@ -86,22 +120,26 @@ This suggests that the added variability is beneficial. \textbf{Order of data augmentation.} % (1) Applying the image crop related augmentations \emph{before} pasting the foreground object and the color-based ones \emph{after} pasting or (2) applying all data augmentations after pasting the foreground object. % While results are ambiguous, we choose the second strategy, as it improves the performance of ViT-S, although not the one of ViT-Ti. -Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) slightly improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}). -For ViT-Ti, the results are ambiguous. +Applying all augmentations after foreground-background recombination (\emph{paste$\to$crop$\to$color}) improves ViT-S's performance compared to applying crop-related augmentations before pasting (\emph{crop$\to$paste$\to$color}). +ViT-Ti results are ambiguous. \textbf{Background pruning.} -When it comes to the choice of backgrounds to use, we test two pruning thresholds ($t_\text{prune}$) to exclude backgrounds with excessive inpainting. +When it comes to the backgrounds to use, we test different pruning thresholds ($t_\text{prune}$) to exclude backgrounds with large inpainting. % and only use backgrounds with an relative size of the infilled region of at most $t_\text{prune}$ (exclusive). A threshold of $t_\text{prune}=1.0$ means that we use all backgrounds that are not fully infilled. % We find that the background pruning does not significantly impact the models' performance. 
% We choose $t_\text{prune}=0.8$ for the following experiments to exclude backgrounds that are mostly artificial. Varying $t_\text{prune}$ has minimal impact. -Therefore, we choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds. -Similarly, applying edge smoothing to foreground masks with Gaussian blurring actually hurts performance on Tiny\name, but slightly improves it on \name. +We choose $t_\text{prune} = 0.8$ to exclude predominantly artificial backgrounds. % One of the most important design decisions is the mixing of the original dataset with \name. -\textbf{Mixing} \name with the original ImageNet data proves crucial. -While constant and linear mixing schedules improve performance over no mixing by $2-3$ p.p. compared to only using Tiny\name, the cosine annealing schedule yields the best results, boosting accuracy by another $0.5-1$ p.p. +\textbf{Mixing} \schemename-augmented samples with the original ImageNet data proves crucial. +While constant and linear mixing schedules improve performance over no mixing by $2-3$ p.p. compared to only augmented samples, the cosine annealing schedule proves optimal, boosting accuracy by $3-4$ p.p. + +\textbf{Edge smoothing.} +We evaluate the impact of using Gaussian blurring to smooth the edges of the foreground masks. +% Similarly, applying edge smoothing to foreground masks with Gaussian blurring actually hurts performance on Tiny\name, but slightly improves it on \name. +For larger models, this gives us a slight performance boost on the full ImageNet (second to last line in \Cref{tab:ablation-recombine}). \textbf{Background strategy.} Another point is the allowed choice of background image for each foreground object. @@ -116,302 +154,344 @@ Another point is the allowed choice of background image for each foreground obje We compare using the original background, a background from the same class, and any background. 
These strategies go from low diversity and high shared information content between the foreground and background to high diversity and low shared information content. For \emph{ViT-Ti}, the latter two strategies perform comparably, while \emph{ViT-S} benefits from the added diversity of using any background. -The same is true when training on the full (ImageNet) version of \name. +The same is true when training on the full ImageNet. -\begin{figure} - \centering - \includegraphics[width=.7\columnwidth]{img/bates.pdf} - \caption{Plot of the probability distribution function (PDF) of the extended Bates distribution for different parameters $\eta$. Higher values of $\eta$ concentrate the distribution around the center.} - \label{fig:bates-pdf} -\end{figure} \begin{table} + \caption{Accuracy of ViT-S on TinyImageNet (TIN) in percent using \schemename with different foreground position distributions by varying the Bates parameter $\eta$. + The best performance is achieved when using the uniform distribution ($\eta=1$) for training.} + \label{tbl:foreground-eta} \centering - \resizebox{\columnwidth}{!}{ + \small + \resizebox{.9\columnwidth}{!}{ \begin{tabular}{ccccccc} \toprule - \multirow{2.5}{*}{\makecell{Training Set/ \\ Bates Parameter}} & \multirow{2.5}{*}{TIN} & \multicolumn{5}{c}{Tiny\name} \\ + \multirow{2.5}{*}{\makecell{Bates Parameter \\during training}} & \multirow{2.5}{*}{\makecell{TIN \\w/o \schemename}} & \multicolumn{5}{c}{TIN w/ \schemename} \\ \cmidrule(l){3-7} - & & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\ + & & $\eta=-3$ & $-2$ & $1/-1$ & $2$ & $3$ \\ \midrule - TinyImageNet & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 \\ - $\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\ - $\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\ - $\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\ - $\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\ - $\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\ + Baseline & 68.9 & 60.5 & 60.2 & 60.8 & 62.6 & 63.1 
\\ + $\eta=-3$ & 71.3 & 79.3 & 79.5 & 79.1 & 79.3 & 79.1 \\ + $\eta=-2$ & 71.5 & 80.0 & 78.7 & 79.3 & 79.1 & 78.8 \\ + $\eta=1/-1$ & 72.3 & 79.5 & 78.9 & 80.2 & 79.7 & 80.4 \\ + $\eta=2$ & 71.3 & 78.2 & 77.8 & 79.1 & 79.6 & 79.9 \\ + $\eta=3$ & 71.4 & 77.2 & 76.9 & 78.6 & 79.6 & 79.7 \\ \bottomrule \end{tabular}} - \caption{Accuracy of ViT-S trained on TinyImageNet (TIN) and Tiny\name with different foreground position distributions by varying the parameter of a Bates distribution $\eta$. - The best performance is achieved using the uniform distribution ($\eta=1$).} \end{table} \textbf{Foreground position.} -Finally, we analyze the foreground object's positioning in the image. -We utilize an extended Bates distribution to sample the position of the foreground object. -The Bates distribution~\cite{Bates1955} with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}. -Therefore, the larger $\eta$, the more concentrated the distribution is around the center. -We extend this concept to $\eta \leq -1$ by defining ${X \sim \text{Bates}(\eta) :\Leftrightarrow s(X) \sim \text{Bates}(-\eta)}$ for $\eta \leq 1$ with $s$ being the sawtooth function on $[0, 1]$: -\begin{align} - s(x) = \begin{cases} - x + 0.5 & \text{if } 0 < x < 0.5 \\ - x - 0.5 & \text{if } 0.5 \leq x \leq 1 - \end{cases} -\end{align} -Note that $s \circ s = \id$ on $[0, 1]$. -This way, distributions with $\eta \leq -1$ are more concentrated around the borders. -$\eta = 1$ and $\eta = -1$ both correspond to the uniform distribution. -The PDF of this extended Bates distribution is visualized in \Cref{fig:bates-pdf}. - -When sampling more towards the center of the image, the difficulty of the task is reduced, which then reduces the performance on TinyImageNet. -This is reflected in the performance when evaluating on Tiny\name with $\eta=2$ and $\eta=3$ compared to $\eta=-1/1$. 
+Finally, we analyze the foreground object's positioning in the image, using a +generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \Z$. +The Bates distribution presents an easy way to sample from a bounded domain with just one hyperparameter that controls its concentration. +$\eta = 1/-1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders (see supplementary material for details). +% We utilize an extended Bates distribution to sample the position of the foreground object. +% The Bates distribution with parameter $\eta \geq 1$ is the mean of $\eta$ independent uniformly distributed random variables \cite{Jonhson1995}. +% The larger $\eta$, the more concentrated the distribution is at the center, $\eta < -1$ concentrates the distribution at the edges. +% We extend this concept to $\eta \leq -1$, shifting the distribution away from the center and towards the edges. +When sampling more towards the center of the image, the difficulty of the task is reduced, which reduces performance on TinyImageNet (\Cref{tbl:foreground-eta}). +This is reflected in the performance when evaluating using \schemename with $\eta=2$ and $\eta=3$ compared to $\eta=-1/1$. We observe a similar reduction for $\eta < -1$. -This experiment is conducted using the LaMa infill model. +% This experiment is conducted using the LaMa infill model. \begin{table} - \centering - \small - \begin{tabular}{lccc} - \toprule - Dataset & Classes & \makecell{Training \\ Images} & \makecell{Validation \\ Images} \\ - \midrule - TinyImageNet & 200 & 100,000 & 10,000 \\ - Tiny\name & 200 & 99,404 & 9,915 \\ - ImageNet & 1,000 & 1,281,167 & 50,000 \\ - \name & 1,000 & 1,274,557 & 49,751 \\ - \bottomrule - \end{tabular} - \caption{Dataset statistics for TinyImageNet, Tiny\name, ImageNet, and \name. 
For \name and Tiny\name we report the number of foreground/background pairs.} + \caption{Dataset statistics for TinyImageNet and ImageNet with and without \schemename. For \schemename we report the number of foreground/background pairs.} \label{tab:dataset-stats} + \centering + \resizebox{.9\columnwidth}{!}{ + \begin{tabular}{l S[table-format=4.0] S[table-format=7.0] S[table-format=5.0]} + \toprule + Dataset & {Classes} & {\makecell{Training \\ Images}} & {\makecell{Validation \\ Images}} \\ + \midrule + TinyImageNet & 200 & 100000 & 10000 \\ + TinyImageNet + \schemename & 200 & 99404 & 9915 \\ + ImageNet & 1000 & 1281167 & 50000 \\ + ImageNet + \schemename & 1000 & 1274557 & 49751 \\ + \bottomrule + \end{tabular}} \end{table} -After fixing the optimal design parameters in \Cref{tab:ablation} (last row), we construct the full \name dataset using the entire ImageNet dataset. -\Cref{tab:dataset-stats} compares the dataset statistics of ImageNet and \name. +After fixing the optimal design parameters in \Cref{tab:ablation-segment,tab:ablation-recombine} (last rows), we run \schemename's segmentation step on the entire ImageNet dataset. +\Cref{tab:dataset-stats} shows the resulting dataset statistics. % The slightly lower number of images in \name is due to \emph{Grounded SAM} returning no or invalid detections for some images. -The slightly reduced image count in \name is due to instances where Grounded SAM failed to produce valid object detections. +The slightly reduced image count for \schemename is due to instances where Grounded SAM fails to produce valid segmentation masks. 
+ \subsection{Image Classification Results} \begin{table} - \centering - \begin{tabular}{lccc} - \toprule - \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy \\ when trained on}} & \multirow{2.5}{*}{Delta} \\ - \cmidrule(lr){2-3} - & ImageNet & \name & \\ - \midrule - ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{+2.3} \\ - ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{+3.5} \\ - ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{+4.5} \\ - \midrule - Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{+1.8} \\ - Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{+1.2} \\ - \midrule - ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{+0.5} \\ - ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{+1.0} \\ - \bottomrule - \end{tabular} - \caption{ImageNet results of models trained on \name and on ImageNet directly. \name improves the performance of all models in our test.} + \caption{ImageNet results of models trained on ImageNet with and without \schemename. \schemename improves the performance of most models, with a larger gain for larger models.} \label{tab:imagenet-results} -\end{table} - -\Cref{tab:imagenet-results} compares the ImageNet performance of models trained on \name and ones trained directly on ImageNet. -We adopt the training setup of \cite{Nauen2023} and \cite{Touvron2022} (details in \Cref{sec:training_setup}) for training ViT \cite{Dosovitskiy2021}, Swin \cite{Liu2021} and ResNet \cite{He2016} models. -Notably, \name improves performance across all tested architectures, including the ResNet models (up to $1$ p.p.), demonstrating benefits beyond Transformers. -For Transformer models, we observe improvements from $1.2$ p.p. to $4.5$ p.p. -This improvement is more substantial for the larger models, with ViT-L gaining $4.5$ p.p. in accuracy. -\name's improvements mostly counteract the drop in performance due to overfitting for large models. -When training on ImageNet, this drop is $3.8$ p.p. 
from ViT-S to ViT-L, while for \name it is reduced to $1.6$ p.p. - - -\begin{table} \centering - \resizebox{\columnwidth}{!}{\begin{tabular}{lccccc} + \small + \resizebox{.8\columnwidth}{!}{\begin{tabular}{lccc} \toprule - Model & Aircraft & Cars & Flowers & Food & Pets \\ + \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{ImageNet Accuracy [\%]}} & \multirow{2.5}{*}{Delta} \\ + \cmidrule(lr){2-3} + & w/o \schemename & w/ \schemename & \\ \midrule - ViT-S @ ImageNet & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\ - ViT-S @ \name & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\ - & \grntxt{+6.2} & \grntxt{+2.4} & \grntxt{+1.0} & \grntxt{+0.5} & \grntxt{+0.7} \\ - \cmidrule(r){1-1} - ViT-B @ ImageNet & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\ - ViT-B @ \name & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\ - & \grntxt{+7.3} & \grntxt{+3.3} & \grntxt{+1.7} & \grntxt{+1.1} & \grntxt{+1.0} \\ - \cmidrule(r){1-1} - ViT-L @ ImageNet & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\ - ViT-L @ \name & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\ - & \grntxt{+5.5} & \grntxt{+0.3} & \grntxt{+2.2} & \grntxt{+1.2} & \grntxt{+0.9} \\ + ViT-S & $79.1\pm0.1$ & $81.4\pm0.1$ & \grntxt{$+2.3$} \\ + ViT-B & $77.6\pm0.2$ & $81.1\pm0.4$ & \grntxt{$+3.5$} \\ + ViT-L & $75.3\pm0.4$ & $79.8\pm0.1$ & \grntxt{$+4.5$} \\ \midrule - Swin-Ti @ ImageNet & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\ - Swin-Ti @ \name & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\ - & \grntxt{+4.1} & \grntxt{+2.5} & \grntxt{+0.3} & \grntxt{+0.4} & \grntxt{+0.6} \\ - \cmidrule(r){1-1} - Swin-S @ ImageNet & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\ - Swin-S @ \name & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & 
$94.9\pm0.3$ \\ - & \grntxt{+5.7} & \grntxt{+2.1} & \grntxt{+1.4} & \grntxt{+0.1} & \grntxt{+0.5} \\ + DeiT-S & $80.1 \pm 0.1$ & $80.0\pm0.3$ & \gtxt{$-0.1$} \\ + DeiT-B & $81.9 \pm 0.3$ & $81.9\pm0.2$ & \gtxt{$\pm0.0$} \\ + DeiT-L & $79.3\pm2.3$ & $82.4\pm0.1$ & \grntxt{$+3.1$} \\ \midrule - ResNet-50 @ ImageNet & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\ - ResNet-50 @ \name & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\ - & \grntxt{+2.1} & \grntxt{+0.6} & \gtxt{$\pm$0} & \grntxt{+0.1} & \gtxt{$\pm$0} \\ - \cmidrule(r){1-1} - ResNet-101 @ ImageNet & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\ - ResNet-101 @ \name & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\ - & \grntxt{+3.0} & \grntxt{+1.3} & \grntxt{+1.7} & \grntxt{+0.3} & \textcolor{red}{-0.3} \\ + Swin-Ti & $77.9\pm0.2$ & $79.7\pm0.1$ & \grntxt{$+1.8$} \\ + Swin-S & $79.4\pm0.1$ & $80.6\pm0.1$ & \grntxt{$+1.2$} \\ + \midrule + ResNet-50 & $78.3\pm0.1$ & $78.8\pm0.1$ & \grntxt{$+0.5$} \\ + ResNet-101 & $79.4\pm0.1$ & $80.4\pm0.1$ & \grntxt{$+1.0$} \\ \bottomrule \end{tabular}} - \caption{Downstream accuracy in percent when finetuning on other datasets. Models were pretrained on \name and ImageNet. Pretraining on \name increases Transformer downstream accuracy on all datasets.} \end{table} -To assess the transferability of \name-trained models, we finetune models pretrained on ImageNet and \name on five fine-grained datasets: -FGVC-Aircraft \cite{Maji2013}, Stanford Cars~\cite{Dehghan2017}, Oxford Flowers \cite{Nilsback2008}, Food-101 \cite{Kaur2017}, and Oxford-IIIT Pets \cite{Parkhi2012}. -While for ResNets, the performance of both training datasets is about the same, for every Transformer, we see the accuracy improve on all downstream dataset by up to 7.3 p.p. and a reduction of error rate of up to $39.3\%$. 
-In summary, these results demonstrate that the improved representation learning achieved by training on \name translates to superior performance not only on ImageNet, but also on a variety of fine-grained image classification tasks. +\Cref{tab:imagenet-results} compares the ImageNet performance of models trained with and without \schemename. +We adopt the training setup of \cite{Nauen2025} and \cite{Touvron2022} for training ViT \cite{Dosovitskiy2021}, Swin \cite{Liu2021} and ResNet \cite{He2016} (representing CNNs) models as well as the setup of DeiT \cite{Touvron2021b} for that model. +Both setups use strong data augmentations like RandAugment, CutMix, and Mixup optimized for Transformers (details in supplementary material). +Notably, \schemename improves performance across all tested architectures, including the ResNet models, % (up to $1$ p.p.), +demonstrating benefits beyond Transformers. +For DeiT we only observe benefits on ImageNet for the larger models. +For other Transformers, we observe improvements from $1.2$ p.p. to $4.5$ p.p. with increasing gains for larger models. +% This improvement is more substantial for the larger models, with ViT-L gaining $4.5$ p.p. in accuracy. +\schemename's improvements counteract the drop in performance for increasing model sizes. +Without \schemename this drop is $3.8$ p.p. (ViT-S to L), while with \schemename it is reduced to $1.6$ p.p. +For DeiT there is a drop of $0.8$ p.p. from small to large, while when using \schemename there is a \emph{gain} of $2.4$ p.p. -\subsection{Further Model Evaluation} -% Additional to just using \name for training, its special properties and posibilities for adjustment of the data distribution make it a valuable tool for evaluating other model properties and biases. -Beyond its use for training, \name's unique properties and controlled data generation capabilities make it a powerful tool for analyzing model behavior and biases. 
- -\paragraph*{Background Robustness} \begin{table} + \caption{Comparison of \schemename and simple Copy-Paste methods. We train ViT-S on ImageNet using the same 3-augment data augmentation on top of the copy-paste augmentation.} + \label{tab:copy-paste-comparison} \centering - \begin{tabular}{lccc} - \toprule - \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Background Robustness \\ when trained on}} & \multirow{2.5}{*}{Delta} \\ - \cmidrule(lr){2-3} - & ImageNet & \name & \\ - \midrule - ViT-S & $0.73\pm0.01$ & $0.99\pm0.01$ & \grntxt{+0.26} \\ - ViT-B & $0.72\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.28} \\ - ViT-L & $0.70\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.30} \\ - \midrule - Swin-Ti & $0.72\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.28} \\ - Swin-S & $0.72\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.28} \\ - \midrule - ResNet-50 & $0.79\pm0.01$ & $0.99\pm0.01$ & \grntxt{+0.20} \\ - ResNet-101 & $0.79\pm0.01$ & $1.00\pm0.01$ & \grntxt{+0.21} \\ - \bottomrule - \end{tabular} - \caption{Evaluation of the background robustness of models trained on \name and on ImageNet directly. 
Training on \name improves the background robustness of all model to $\approx1.00$, meaning the model is indifferent to the choice of background.} - \label{tab:background-robustness} + \resizebox{\columnwidth}{!}{ + \begin{tabular}{lcc S[table-format=+2.1,retain-explicit-plus,detect-inline-weight=math,detect-weight=true]} + \toprule + Augmentation & labels & \makecell{ Accuracy [\%]} & {\makecell{Delta \\to Prev.}} \\ + \midrule + % Baseline & & $79.1 \pm 0.1$ \\ + Baseline + \textbf{Simple Copy-Paste} & bg & $31.3 \pm 0.6$ & \\ + + mixed labels & fg + bg & $32.0 \pm 0.8$ & +0.7 \\ + + fg labels & fg & $31.6 \pm 0.9$ & -0.4 \\ + + \emph{range} foreground size variation & \gtxt{fg} & $43.0 \pm 1.2$ & \bfseries +11.4 \\ + + infilled backgrounds & \gtxt{fg} & $68.7 \pm 0.2$ & \bfseries +25.7 \\ + + \emph{cos} mixing strategy & \gtxt{fg} & $81.2 \pm 0.1$ & \bfseries +12.5 \\ + + edge smoothing & \gtxt{fg} & $81.3 \pm 0.1$ & +0.1 \\ + + background pruning$=$ \textbf{\schemename} & \gtxt{fg} & $81.4 \pm 0.1$ & +0.1 \\ + \bottomrule + \end{tabular}} +\end{table} +\textbf{Comparison to Simple Copy-Paste.} +We compare \schemename to a simple adaptation of the Copy-Paste augmentation inspired by \cite{Ge2023,Ghiasi2020,Shermaine2025} in \Cref{tab:copy-paste-comparison}. +Contrary to semantic segmentation, we do not have foreground masks available. +Thus, we paste the extracted foreground objects from \emph{\schemename's segmentation stage} onto normal ImageNet images. +% Since such images do not have straightforward classification labels, we test multiple possibilities. +We observe 3 large jumps in accuracy: (\textbf{1}) From our \emph{range} foreground size variation (+11.4\%), (\textbf{2}) from using our infilled backgrounds instead of images from the dataset (+25.7\%), and (\textbf{3}) from our \emph{cos} mixing strategy with non-augmented images (+12.5\%). +\schemename's changes to the naive copy-paste augmentation are thus imperative for good classification performance. 
+ +\begin{table}[t] + \caption{Downstream accuracy in percent when finetuning on other datasets. Models are pretrained on ImageNet with and without \schemename. Pretraining using \schemename increases transformer downstream accuracy. + % on all datasets. + } + \label{tab:downstream-results} + \centering + \resizebox{\columnwidth}{!}{\begin{tabular}{lcccccc} + \toprule + Model & \schemename & Aircraft & Cars & Flowers & Food & Pets \\ + \midrule + ViT-S & \xmark & $72.4\pm1.0$ & $89.8\pm0.3$ & $94.5\pm0.2$ & $89.1\pm0.1$ & $93.8\pm0.2$ \\ + ViT-S & \cmark & $78.6\pm0.5$ & $92.2\pm0.2$ & $95.5\pm0.2$ & $89.6\pm0.1$ & $94.5\pm0.2$ \\ + & & \grntxt{$+6.2$} & \grntxt{$+2.4$} & \grntxt{$+1.0$} & \grntxt{$+0.5$} & \grntxt{$+0.7$} \\ + \cmidrule(r){1-1} + ViT-B & \xmark & $71.7\pm0.5$ & $90.0\pm0.2$ & $94.8\pm0.4$ & $89.8\pm0.2$ & $94.1\pm0.4$ \\ + ViT-B & \cmark & $79.0\pm2.2$ & $93.3\pm0.1$ & $ 96.5\pm0.1$ & $90.9\pm0.1$ & $95.1\pm0.4$ \\ + & & \grntxt{$+7.3$} & \grntxt{$+3.3$} & \grntxt{$+1.7$} & \grntxt{$+1.1$} & \grntxt{$+1.0$} \\ + \cmidrule(r){1-1} + ViT-L & \xmark & $72.1\pm1.0$ & $88.8\pm0.3$ & $94.4\pm0.3$ & $90.1\pm0.2$ & $94.2\pm0.4$ \\ + ViT-L & \cmark & $77.6\pm1.2$ & $89.1\pm0.2$ & $96.6\pm0.1$ & $91.3\pm0.1$ & $95.1\pm0.1$ \\ + & & \grntxt{$+5.5$} & \grntxt{$+0.3$} & \grntxt{$+2.2$} & \grntxt{$+1.2$} & \grntxt{$+0.9$} \\ + \midrule + DeiT-S & \xmark & $75.3\pm0.4$ & $91.1\pm0.2$ & $94.8\pm0.4$ & $89.2\pm0.2$ & $92.4\pm0.2$ \\ + DeiT-S & \cmark & $76.8\pm0.8$ & $91.9\pm0.2$ & $95.2\pm0.3$ & $89.1\pm0.2$ & $92.3\pm0.4$ \\ + & & \grntxt{$+1.5$} & \grntxt{$+0.8$} & \grntxt{$+0.4$} & \gtxt{$-0.1$} & \gtxt{$-0.1$} \\ + \cmidrule(r){1-1} + DeiT-B & \xmark & $77.0\pm1.2$ & $92.9\pm0.2$ & $96.1\pm0.2$ & $91.2\pm0.1$ & $93.3\pm0.4$ \\ + DeiT-B & \cmark & $79.3\pm0.3$ & $93.1\pm0.1$ & $96.4\pm0.2$ & $91.3\pm0.1$ & $93.3\pm0.1$ \\ + & & \grntxt{$+2.3$} & \gtxt{$+0.2$} & \grntxt{$+0.3$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\ + \cmidrule(r){1-1} + DeiT-L & \xmark & 
$72.8\pm5.5$ & $92.8\pm1.0$ & $95.8\pm1.5$ & $90.5\pm2.6$ & $92.4\pm2.0$ \\ + DeiT-L & \cmark & $78.8\pm0.8$ & $93.8\pm0.2$ & $97.0\pm0.2$ & $92.0\pm0.2$ & $93.5\pm0.2$ \\ + & & \grntxt{$+6.0$} & \grntxt{$+1.0$} & \grntxt{$+1.2$} & \grntxt{$+1.5$} & \grntxt{$+1.1$} \\ + \midrule + Swin-Ti & \xmark & $77.0\pm0.1$ & $91.3\pm0.6$ & $95.9\pm0.1$ & $90.0\pm0.2$ & $94.2\pm0.1$ \\ + Swin-Ti & \cmark & $81.1\pm0.8$ & $92.8\pm0.4$ & $96.2\pm0.1$ & $90.4\pm0.3$ & $94.8\pm0.5$ \\ + & & \grntxt{$+4.1$} & \grntxt{$+2.5$} & \grntxt{$+0.3$} & \grntxt{$+0.4$} & \grntxt{$+0.6$} \\ + \cmidrule(r){1-1} + Swin-S & \xmark & $75.7\pm1.4$ & $91.0\pm0.3$ & $95.9\pm0.5$ & $91.1\pm0.2$ & $94.4\pm0.1$ \\ + Swin-S & \cmark & $81.4\pm0.2$ & $93.1\pm0.2$ & $96.3\pm0.3$ & $91.2\pm0.2$ & $94.9\pm0.3$ \\ + & & \grntxt{$+5.7$} & \grntxt{$+2.1$} & \grntxt{$+1.4$} & \gtxt{$+0.1$} & \grntxt{$+0.5$} \\ + \midrule + ResNet-50 & \xmark & $78.2\pm0.5$ & $89.8\pm0.2$ & $91.7\pm0.4$ & $84.4\pm0.2$ & $93.7\pm0.3$ \\ + ResNet-50 & \cmark & $80.3\pm0.4$ & $90.4\pm0.2$ & $91.7\pm0.2$ & $84.5\pm0.2$ & $93.7\pm0.3$ \\ + & & \grntxt{$+2.1$} & \grntxt{$+0.6$} & \gtxt{$\pm0.0$} & \gtxt{$+0.1$} & \gtxt{$\pm0.0$} \\ + \cmidrule(r){1-1} + ResNet-101 & \xmark & $78.4\pm0.6$ & $90.3\pm0.1$ & $91.2\pm0.5$ & $86.0\pm0.2$ & $94.3\pm0.2$ \\ + ResNet-101 & \cmark & $81.4\pm0.5$ & $91.3\pm0.1$ & $92.9\pm0.2$ & $86.3\pm0.1$ & $94.0\pm0.3$ \\ + & & \grntxt{$+3.0$} & \grntxt{$+1.3$} & \grntxt{$+1.7$} & \grntxt{$+0.3$} & \textcolor{red}{$-0.3$} \\ + \bottomrule + \end{tabular}} \end{table} +\textbf{Downstream tasks.} To assess the transferability of \schemename-trained models, we finetune models pretrained on ImageNet with and without \schemename on five fine-grained datasets: +FGVC-Aircraft \cite{Maji2013}, Stanford Cars~\cite{Dehghan2017}, Oxford Flowers \cite{Nilsback2008}, Food-101 \cite{Kaur2017}, and Oxford-IIIT Pets \cite{Parkhi2012}. 
+% While for ResNets, the performance of both training datasets is about the same, +In \Cref{tab:downstream-results} we see transformer accuracies improve on all these datasets by up to 7.3 p.p. +% and a reduction of error rate of up to $39.3\%$. +% Notably, training with \name increases the downstream performance of DeiT-S and DeiT-B, even though the ImageNet results were the same. +% This demonstrates that the improved representations from training on \name translate to superior performance beyond gains from better ImageNet performance. +Notably, training with \schemename boosts the downstream performance of DeiT-S and DeiT-B, despite similar ImageNet results. +This shows that the improved representations from training with \schemename translate to gains beyond better ImageNet scores. +% not only on ImageNet, but also on fine-grained image classification tasks. + + +\subsection{Bias and Robustness Evaluation} +% Additional to just using \name for training, its special properties and possibilities for adjustment of the data distribution make it a valuable tool for evaluating other model properties and biases. +Beyond its use for training, \schemename's unique properties and controlled data generation capabilities make it a powerful tool for analyzing behavior and biases of black-box models. + +\begin{figure*} + \centering + \includegraphics[width=.95\textwidth]{img/bg_robustness.pdf} + \caption{Evaluation of background robustness on ImageNet + \schemename, ImageNet9 and CounterAnimal. + We plot the in-distribution (top of arrow) and the out-of-distribution (bottom of arrow) accuracy when training with and without \schemename. + We annotate each arrow with its length $\Delta$. + Training with \schemename improves the background robustness of all transformers by mostly boosting the out-of-distribution accuracy. 
+ } + \label{fig:background-robustness} +\end{figure*} + +\textbf{Background Robustness.} % By adjusting the background distribution from using a background from an image of the same class as the foreground to using any background, we can evaluate the robustness of models to shifts in the background distribution. % We assess background robustness by changing the background distribution, comparing accuracy with backgrounds of the same class as the foreground to using any background. We assess the robustness of models to shifts in the background distribution from a class-related background to any background. % We define the background robustness coefficient to be the accuracy of a model on \name when using the same class background divided by the accuracy when using any background: -Background robustness is defined to be the ratio of accuracy on \name with same-class backgrounds to accuracy with any background: -\begin{align} - \text{Background Robustness} = \frac{\text{Acc}(\name_\text{all})}{\text{Acc}(\name_\text{same})} -\end{align} -It represents the relative drop in performance under a background distribution shift. -\Cref{tab:background-robustness} presents the background robustness of various models. -When trained on ImageNet, smaller models generally exhibit greater robustness to changes in the background distribution than larger models and ResNet is more robust than the tested Transformer models. -Crucially, training on \name instead of ImageNet improves the background robustness of all models to $\approx1.00$, meaning that these models are agnostic to the choice of background and only classify based on the foreground. -These findings highlight the generalization benefits of \name. 
+% Background robustness is defined to be the ratio of accuracy on \name with same-class backgrounds to accuracy with any background: +% \begin{align} +% \text{Background Robustness} = \frac{\text{Acc}(\name_\text{all})}{\text{Acc}(\name_\text{same})} +% \end{align} +% It represents the relative drop in performance under a background distribution shift. +\Cref{fig:background-robustness} presents the background robustness results for three datasets: ImageNet with \schemename (all backgrounds vs. backgrounds of same class), ImageNet9 \cite{Xiao2020} (random backgrounds vs. original backgrounds), and CounterAnimal \cite{Wang2024f} (counter vs. common background). +The top triangle of each arrow represents the in-distribution backgrounds and the bottom triangle represents the out-of-distribution ones. +We follow ImageNet9 and CounterAnimal and assess the background robustness in terms of the accuracy gap when evaluating a model on images of normal background distribution compared to out-of-distribution backgrounds (length of each arrow; $\Delta$). +% When trained on ImageNet, smaller models generally exhibit greater robustness to changes in the background distribution than larger models and ResNet is more robust than the tested Transformer models. +Crucially, \schemename improves the background robustness of all models and across datasets, reducing the background-gap by boosting the performance on the out-of-background-distribution samples more than the in-distribution ones. +% to $\approx1.00$, meaning that these models are agnostic to the choice of background and only classify based on the foreground. +These findings highlight the generalization benefits of \schemename to unusual image compositions. 
-\paragraph*{Foreground Focus} -\begin{table} +\begin{figure*} \centering - \resizebox{\columnwidth}{!}{ - \begin{tabular}{lcccccc} - \toprule - \multirow{4}{*}{Model} & \multicolumn{6}{c}{Foreground Focus when trained on} \\ - \cmidrule(l){2-7} - & IN & FN & IN & FN & IN & FN \\ - \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(l){6-7} - & \multicolumn{2}{c}{GradCam} & \multicolumn{2}{c}{GradCam++} & \multicolumn{2}{c}{IG} \\ - \midrule - ViT-S & $1.2\pm0.1$ & $2.3\pm0.3$ & $1.2\pm0.1$ & $2.1\pm0.4$ & $1.9\pm0.1$ & $2.7\pm0.1$ \\ - ViT-B & $1.2\pm0.1$ & $2.4\pm0.7$ & $1.1\pm0.1$ & $2.1\pm0.1$ & $1.7\pm0.1$ & $2.7\pm0.1$ \\ - ViT-L & $1.3\pm0.1$ & $1.6\pm0.1$ & $1.1\pm0.1$ & $1.3\pm0.1$ & $1.3\pm0.1$ & $2.6\pm0.1$ \\ - \midrule - Swin-Ti & $0.9\pm0.1$ & $0.7\pm0.1$ & $1.0\pm0.3$ & $0.7\pm0.3$ & $2.5\pm01$ & $4.8\pm0.3$ \\ - Swin-S & $0.8\pm0.1$ & $0.7\pm0.1$ & $0.7\pm0.1$ & $0.7\pm0.4$ & $2.4\pm0.1$ & $4.6\pm0.3$ \\ - \midrule - ResNet-50 & $2.2\pm0.1$ & $2.7\pm0.1$ & $2.0\pm0.1$ & $2.9\pm0.1$ & $3.2\pm0.1$ & $4.9\pm0.2$ \\ - ResNet-101 & $2.3\pm0.1$ & $2.8\pm0.1$ & $2.2\pm0.1$ & $3.0\pm0.1$ & $3.2\pm0.1$ & $4.8\pm0.1$ \\ - \bottomrule - \end{tabular}} - \caption{Evaluation of the foreground focus using GradCam, GradCam++ and IntegratedGradients of models trained on \name (FN) and on ImageNet (IN) directly. Training on \name improves the foreground focus of almost all models.} - \label{tab:foreground-focus} -\end{table} + \includegraphics[width=.95\textwidth]{img/fg_focus.pdf} + \caption{Evaluation of the foreground focus (\Cref{eq:fg-focus}) using GradCam, GradCam++ and IntegratedGradients (IG) of models trained on ImageNet. 
Training with \schemename improves the foreground focus of almost all models.} + \label{fig:foreground-focus} +\end{figure*} -Leveraging our inherent knowledge of the foreground masks when using \name, as well as common XAI techniques~\cite{Selvaraju2016,Chattopadhay2018,Sundararajan2017}, we can evaluate a model's focus on the foreground object. -We can directly evaluate ImageNet trained models, but this technique can also be extended to other datasets without relying on manually annotated foreground-masks. -To evaluate the foreground focus, we employ Grad-CAM \cite{Selvaraju2016}, Grad-CAM++ \cite{Chattopadhay2018} or IntegratedGradients (IG) \cite{Sundararajan2017} to compute the per-pixel importance of an image for the model's prediction. +\textbf{Foreground Focus.} +Leveraging our inherent knowledge of the foreground masks when using \schemename, as well as common XAI techniques~\cite{Selvaraju2016,Chattopadhay2018,Sundararajan2017}, we can evaluate a model's focus on the foreground object. +% I.e. we measure how much the model's decision depends on the foreground. +We can directly evaluate ImageNet-trained models, but this technique can also be extended to other datasets without relying on manually annotated foreground masks. +To evaluate the foreground focus, we employ Grad-CAM \cite{Selvaraju2016}, Grad-CAM++ \cite{Chattopadhay2018} and IntegratedGradients (IG) \cite{Sundararajan2017} to compute the per-pixel importance of an image for the model's prediction. The foreground focus is defined to be the ratio of the foreground's relative importance to its relative size in the image: -\begin{align} +\begin{align} \label{eq:fg-focus} \text{FG Focus}(\text{img}) = \frac{\text{Area}(\text{img}) \hspace{3pt} \text{Importance}(\text{fg})}{\text{Area}(\text{fg}) \hspace{3pt} \text{Importance}(\text{img})} \end{align} -The foreground focus of a model is its average foreground focus over all test images. -\Cref{tab:foreground-focus} presents our findings. 
-Training on \name significantly increasees the foreground focus of ViT and ResNet across all metrics used. -For Swin, the foreground focus stagnates when measured using GradCam and GradCam++, but almost doubles when using IG. +If all pixels uniformly receive the same importance value, the foreground focus is one. +The foreground focus of a model is its average focus over all test images. +\Cref{fig:foreground-focus} presents our findings. +Using \schemename significantly increases the foreground focus of ViT, DeiT and ResNet across all XAI metrics. +% I.e. \schemename-trained models base their decision more on the foreground object compared to the background than models trained without \schemename. +% For Swin, the foreground focus stagnates when measured using GradCam and GradCam++, but almost doubles when using IG. +% We hypothesize that Swin's below-uniform foreground focus reported with GradCam is due to its specific implementation for Swin. +We hypothesize that Swin's below-uniform foreground focus with GradCam is due to its specific implementation. % These differences might be due to the way GradCam is calculated for Swin \todo{cite package website where this is from} and the \todo{common critique of GradCam}. -\paragraph*{Center Bias} -\begin{table} +\begin{table}[t] + \caption{ + % Evaluation of the center bias. + Accuracy relative to the center accuracy of multiple instantiations of the models when the foreground object is in different cells of a $3 \times 3$ grid. + We calculate center bias according to \Cref{eq:center-bias}. 
+ Using \schemename significantly reduces models' center bias.} + \label{tab:center-bias} \centering - \resizebox{\columnwidth}{!}{ + \resizebox{.78\columnwidth}{!}{ \begin{tabular}{lccc} \toprule - \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias when trained on}} & \multirow{2.5}{*}{Delta} \\ + \multirow{2.5}{*}{Model} & \multicolumn{2}{c}{\makecell{Center Bias [\%] when trained}} & \multirow{2.5}{*}{Delta} \\ \cmidrule(lr){2-3} - & ImageNet & \name \\ + & w/o \schemename & w/ \schemename \\ \midrule - ViT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNetAll_v3.pdf}} \\ - & $0.255\pm0.008$ & $0.220\pm003$ & \grntxt{-0.035} \\ - ViT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNetAll_v3.pdf}} \\ - & $0.254\pm0.004$ & $0.190\pm0.002$ & \grntxt{-0.064} \\ - ViT-L & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNetAll_v2.pdf} 
\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNetAll_v3.pdf}} \\ - & $0.243\pm0.011$ & $0.117\pm0.007$ & \grntxt{-0.126} \\ + ViT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-S_RecombNet_all_v3.pdf}} \\ + & $25.5\pm0.8$ & $22.0\pm0.3$ & \grntxt{$-3.5$} \\ + ViT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-B_RecombNet_all_v3.pdf}} \\ + & $25.4\pm0.4$ & $19.0\pm0.2$ & \grntxt{$-6.4$} \\ + ViT-L & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ViT-L_RecombNet_all_v3.pdf}} \\ + & $24.3\pm1.1$ & $11.7\pm0.7$ & \grntxt{$-12.6$} \\ \midrule - Swin-Ti & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v3.pdf}} & 
\raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNetAll_v3.pdf}} \\ - & $0.250\pm0.007$ & $0.165\pm0.002$ & \grntxt{-0.085} \\ - Swin-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNetAll_v3.pdf}} \\ - & $0.232\pm0.001$ & $0.156\pm002$ & \grntxt{-0.076} \\ + DeiT-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-S_fornet_all_linear_v3.pdf}} \\ + & $20.4 \pm 0.2$ & $21.2 \pm 0.1$ & \gtxt{$+0.8$} \\ + DeiT-B & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_vNone.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_ImageNet_v3.pdf} } & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-B_fornet_all_cos_v3.pdf}} \\ + & $19.0 \pm 0.7$ & $19.0 \pm 0.2$ & \gtxt{$\pm0.0$} \\ + DeiT-L & \raisebox{-6pt}{ 
\includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_ImageNet_v3.pdf} } & \raisebox{-6pt}{ \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v1.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v2.pdf} \includegraphics[width=.08\columnwidth]{img/DeiT-L_fornet_all_cos_v3.pdf} } \\ + & $21.2 \pm 0.2$ & $18.0 \pm 0.2$ & \grntxt{$-3.2$} \\ \midrule - ResNet50 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNetAll_v3.pdf}} \\ - & $0.263\pm0.003$ & $0.197\pm0.003$ & \grntxt{-0.066} \\ - ResNet101 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNetAll_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNetAll_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNetAll_v3.pdf}} \\ - & $0.230\pm0.003$ & $0.199\pm002$ & \grntxt{-0.031} \\ + Swin-Ti & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v1.pdf} 
\includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-Ti_RecombNet_all_v3.pdf}} \\ + & $25.0\pm0.7$ & $16.5\pm0.2$ & \grntxt{$-8.5$} \\ + Swin-S & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/Swin-S_RecombNet_all_v3.pdf}} \\ + & $23.2\pm0.1$ & $15.6\pm0.2$ & \grntxt{$-7.6$} \\ + \midrule + ResNet50 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet50_RecombNet_all_v3.pdf}} \\ + & $26.3\pm0.3$ & $19.7\pm0.3$ & \grntxt{$-6.6$} \\ + ResNet101 & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_ImageNet_v3.pdf}} & \raisebox{-6pt}{\includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v1.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v2.pdf} \includegraphics[width=.08\columnwidth]{img/ResNet101_RecombNet_all_v3.pdf}} \\ + & $23.0\pm0.3$ & $19.9\pm0.2$ & \grntxt{$-3.1$} \\ \bottomrule \end{tabular} } - \includegraphics[width=.75\columnwidth]{img/colorbar_horizontal.pdf} - \caption{Evaluation of the position bias. 
We plot the accuracy relative to the center accuracy of multiple instantiations of the models when the foreground objects is in different cells a $3 \times 3$ grid. - Training on \name significantly reduces a models center bias.} - \label{tab:center-bias} + \includegraphics[width=.8\columnwidth]{img/colorbar_horizontal.pdf} \end{table} -With \name we have unique control over the position of the foreground object in the image. -This lets us quantify the center bias of ImageNet- and \name-trained models. -We divide the image into a $3 \times 3$ grid and evaluate model accuracy when the foreground object is in each of the $9$ grid cells. + +\textbf{Center Bias.} +With \schemename we have unique control over the position of the foreground object in the image. +This lets us quantify the center bias of models trained with and without \schemename. +We divide the image into a $3 \times 3$ grid and evaluate model accuracy when the (scaled-down) foreground object is in each of the $9$ grid cells. Each cell's accuracy is divided by the accuracy in the center cell for normalization, which gives us the relative performance drop when the foreground is in each part of the image. 
The center bias is calculated as one minus the average of the minimum performance of a corner cell and the minimum performance of a side cell: -\begin{align} - \begin{split} - & \text{Center Bias} = \\ - & \hspace{7pt} 1 - \frac{\min\limits_{a, b \in \{0, 2\}} \text{Acc}(\text{cell}_{(a, b)}) + \min\limits_{\substack{a=1 \text{ or } b=1 \\ a \neq b}} \text{Acc}(\text{cell}_{(a, b)})}{2 \text{Acc}(\text{cell}_{(1, 1)})} - \end{split} +% \begin{align} +% \begin{split} +% & \text{Center Bias} = \\ +% & \hspace{7pt} 1 - \frac{\min\limits_{a, b \in \{0, 2\}} \text{Acc}(\text{cell}_{(a, b)}) + \min\limits_{\substack{a=1 \text{ or } b=1 \\ a \neq b}} \text{Acc}(\text{cell}_{(a, b)})}{2 \text{Acc}(\text{cell}_{(1, 1)})} +% \end{split} +% \end{align} +\begin{align} \label{eq:center-bias} + \text{Center Bias} = 1 - \frac{\min\limits_{c \in \text{sides}} \text{Acc}(c) + \min\limits_{c \in \text{corners}} \text{Acc}(c)}{2 \text{Acc}(c_\text{center})} \end{align} \Cref{tab:center-bias} visualizes the center bias of three instantiations of each model. -Performance is generally highest in the center and the center top and bottom and center left and right cells, and lowest in the four corners. +Performance is generally highest in the center and lowest in the four corners. Interestingly, ImageNet-trained models perform slightly better when the foreground object is on the right side of the image, compared to the left side, despite our use of random flipping with a probability of $0.5$ during training. % Training on \name reduces the center bias of all models by at least half. -Training on \name significantly reduces center bias across all models. -This demonstrates that \name promotes a more uniform spatial attention distribution. -Their accuracy is higher in the center left and right cells than in the center top and bottom ones, which is not the case for ImageNet-trained models. 
+Using \schemename significantly reduces center bias across models, with a more uniform performance especially across the middle row. +% Their accuracy is higher in the center left and right cells than in the center top and bottom ones, which is not the case for ImageNet-trained models. +% This demonstrates that \schemename promotes a more uniform spatial attention distribution, counteracting the center-bias of ImageNet. +Thus, \schemename makes the model recognize objects across a wider spatial distribution, counteracting the center-bias of ImageNet. -\paragraph*{Size Bias} -\begin{figure} +\begin{figure}[t!] \centering - \includegraphics[width=.9\columnwidth]{img/size_bias.pdf} - \caption{Evaluation of the size bias of models trained on \name. We plot the accuracy relative to the accuracy when using the mean foreground size.} + \includegraphics[width=\columnwidth]{img/size_bias_grid.pdf} + \caption{Evaluation of the size bias of models trained on ImageNet. We plot the accuracy relative to the accuracy when using the default size ($f_\text{size} = 1.0$).} \label{fig:size-bias} \end{figure} -Finally, we evaluate the impact of different-sized foreground objects on the accuracy. + +\textbf{Size Bias.} +Finally, we evaluate the impact of different-sized foreground objects on the accuracy. For this evaluation, we use the \emph{mean} foreground size strategy. We introduce a size factor $f_\text{size}$ by which we additionally scale the foreground object before pasting it onto the background. -Results are again normalized by the accuracy when using the mean foreground size ($f_\text{size} = 1.0$). -\Cref{fig:size-bias} shows the size bias curves of ViT-S and ViT-B when trained on ImageNet and \name. +Results are normalized by the accuracy when using $f_\text{size} = 1.0$. +\Cref{fig:size-bias} shows the size bias curves of models trained with and without \schemename.
% When training on \name, the resulting model keeps it's good performance on smaller foreground objects, while models trained on ImageNet fall of faster and lower. -Models trained on \name maintain better performance even with smaller foreground objects, when ImageNet-trained models exhibit a more rapid performance decline. -Therefore, \name-training improves robustness to variations in object scale. +Models trained using \schemename maintain better performance, especially with smaller foreground objects. +%, when ImageNet-trained models exhibit a more rapid performance decline. +Therefore, \schemename-training improves robustness to variations in object scale, especially for larger models. diff --git a/sec/intro.tex b/sec/intro.tex index 645d9e0..ad4c720 100644 --- a/sec/intro.tex +++ b/sec/intro.tex @@ -15,46 +15,59 @@ \begin{figure} \centering \includegraphics[width=\columnwidth]{img/fig-1.pdf} - \caption{Comparison of \name and ImageNet. \name recombines foreground objects with different backgrounds each epoch, thus creating a more diverse training set. We still apply traditional data augmentation afterwards.} + \caption{Comparison of traditional image classification training and training when using \schemename. \schemename recombines foreground objects with different backgrounds each epoch, thus creating a more diverse training set. We still apply strong traditional data augmentation afterwards.} \label{fig:fig-1} \end{figure} -Image classification, a fundamental task in computer vision (CV), involves assigning a label to an image from a predefined set of categories. -This seemingly simple task underpins a wide range of applications, including medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2022b}, and object recognition~\cite{Carion2020,He2017,Girshick2013}.
-Furthermore, image classification is used for large-scale pretraining of vision models~\cite{Dosovitskiy2021,Liu2021,Touvron2021b} and to judge the progress of the field of CV \cite{Khan2022, Rangel2024}. -The advent of large-scale datasets, particularly ImageNet \cite{Deng2009}, containing millions of labeled images across thousands of categories, has been instrumental in driving significant progress in this field. -ImageNet served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016} and remains the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}. +Image classification, a fundamental task in computer vision (CV), involves assigning labels to images from a set of categories. +It underpins a wide range of applications, like medical diagnosis~\cite{Sanderson2022,Vezakis2024}, autonomous driving~\cite{Wang2022b}, and object recognition~\cite{Carion2020,He2017,Girshick2013} and facilitates large-scale pretraining~\cite{Dosovitskiy2021,Liu2021,Touvron2021b}, and progress evaluation in CV~\cite{Khan2022, Rangel2024}. +% Furthermore, image classification is used for large-scale pretraining of vision models~\cite{Dosovitskiy2021,Liu2021,Touvron2021b} and to judge the progress of the field of CV \cite{Khan2022, Rangel2024}. +The advent of large-scale datasets, particularly ImageNet~\cite{Deng2009}, served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016} and remains the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}. +% containing millions of labeled images across thousands of categories, has been instrumental in driving significant progress in this field. +% ImageNet served as a catalyst for the rise of large-scale CV models~\cite{Krizhevsky2012, He2016} and remains the most important CV benchmark for more than a decade \cite{Krizhevsky2012,Touvron2022, Wortsman2022, He2016}. 
% It is used to train and evaluate the best models in the field. +While traditionally, convolutional neural networks (CNNs) have been the go-to architecture in CV, Transformers \cite{Vaswani2017}, particularly the Vision Transformer (ViT) \cite{Dosovitskiy2021}, have emerged as a powerful alternative and go-to architecture, demonstrating +% These attention-based models have demonstrated +superior performance in various vision tasks, including image classification \cite{Wortsman2022,Yu2022,Carion2020,Zong2022,Wang2022a}. + -While traditionally, convolutional neural networks (CNNs) have been the go-to architecture for image classification, Transformers \cite{Vaswani2017}, particularly the Vision Transformer (ViT) \cite{Dosovitskiy2021}, have emerged as a powerful alternative. -These attention-based models have demonstrated superior performance in various vision tasks, including image classification \cite{Wortsman2022,Yu2022,Carion2020,Zong2022,Wang2022a}. Data augmentation is a key technique for training image classification models. % A key technique for training image classification models, especially with limited data, is data augmentation. -Traditional data augmentation methods, such as random cropping, flipping, and color jittering, are commonly employed to increase the diversity of the training data and improve the model's performance~\cite{Xu2023d, Shorten2019}. -These basic transformations, originally designed for CNNs, change the input images in a way that preserves their semantic meaning~\cite{Alomar2023}. +Traditional augmentation methods, such as cropping, flipping, or color shifts, are commonly employed to increase data diversity~\cite{Xu2023d, Shorten2019}, but remain bound to existing image compositions. +While these preserve the images' semantic meaning, their ability to teach spatial invariances is limited. +% the diversity of the training data and improve the model's performance~\cite{Xu2023d, Shorten2019}. 
+% These basic transformations, originally designed for CNNs, change the input images in a way that preserves their semantic meaning~\cite{Alomar2023}, but are limited to existing image compositions. +While combinations of these data augmentations are still used today, they originally were proposed to benefit CNNs. However, the architectural differences of CNNs and Transformers suggest that the latter might benefit from different data augmentation strategies. -In particular, the Transformers self-attention mechanism is not translation equivariant~\cite{RojasGomez2023,Ding2023a}, meaning that the model does not inherently understand the spatial relationships between pixels. +In particular, the self-attention mechanism, unlike a CNN, is not translation equivariant~\cite{RojasGomez2023,Ding2023a}, meaning that the model is not designed to understand the spatial relationships between pixels. % This creates the need for novel data augmentation strategies tailored to the Transformer architecture. % This fact opens a new design space for data augmentation strategies to help Transformers understand the basic invariances of image classification. +% Note that these traditional data augmentations are also limited by existing image compositions. -Inspired by this inductive bias of CNNs, that is not inherent to ViTs, we propose \schemename, a novel data augmentation scheme for image classification which makes the translation equivariance of CNNs explicit in the training data by recombining foreground objects at varying positions with different backgrounds. +Recognizing that Transformers need to learn spatial relationships directly from data, +% and in general are usually trained on larger datasets~\cite{Kolesnikov2020}, +we propose \schemename, a data augmentation method that makes these relationships explicit by recombining foreground objects with diverse backgrounds. 
+Thus, \schemename goes beyond existing image compositions and encodes desired invariances directly into the training data (see \Cref{fig:fig-1}). +% Inspired by this inductive bias of CNNs, that is not inherent to ViTs, we propose \schemename, a novel data augmentation scheme for image classification which makes the translation equivariance of CNNs explicit in the training data by recombining foreground objects at varying positions with different backgrounds. % In this paper, we address the challenge of effectively training Transformers for image classification by proposing \schemename, a novel data augmentation scheme for image classification, which combines foreground objects with different backgrounds. -Applying \schemename to ImageNet gives rise to \name, a novel dataset that enables this data augmentation with with fine-grained control over the image composition. -Recognizing that Transformers need to learn the spatial relationships from data, since they are not inherently translation invariant, and in general are usually trained on larger datasets~\cite{Kolesnikov2020}, we separate the foreground objects in ImageNet from their backgrounds, using an open-world object detector~\cite{Ren2024}, and fill in the background in a plausible way using an object removal model~\cite{Sun2024,Suvorov2021}. -This allows us to recombine any foreground object with any background on the fly, creating a highly diverse training set. -During recombination, we can control important parameters, like the size and position of the foreground object, to help the model learn the spatial invariances necessary for image classification. -We show that training on \name instead of ImageNet increases the model accuracy of Transformers by up to 4.5 p.p. on ImageNet and an up to $39.3\%$ reduction in error rate on downstream tasks. 
+% Applying \schemename to ImageNet gives rise to \name, a novel dataset that enables this data augmentation with with fine-grained control over the image composition. +Applying \schemename to a dataset like ImageNet is a two-step process: +(1)~We separate the foreground objects in ImageNet from their backgrounds, using an open-world object detector~\cite{Ren2024} and fill in the background in a neutral way using an object removal model~\cite{Sun2024,Suvorov2021}. +(2)~This allows us to then recombine any foreground object with any background on the fly, creating a highly diverse training set. +% During recombination, we can control important parameters, like the size and position of the foreground object, to help the model learn the spatial invariances necessary for image classification. +By exploiting the control over foreground size and position during recombination, \schemename explicitly teaches spatial invariances that image classification models typically must learn implicitly. +We show that using \schemename additionally to strong traditional data augmentation increases the model accuracy of Transformers by up to 4.5 p.p. on ImageNet and reduces the error rate by up to $7.3$ p.p. in downstream tasks. -Additionally, \schemename is a useful tool for analyzing model behavior and biases, when used during the evaluation phase. -We utilize our control over the image distribution to quantify a model's background robustness (by varying the choice of background), foreground focus (by leveraging our knowledge about the placement of the foreground object), center bias (by controlling the object's position), and size bias (by controlling object size). -These analyses provide insights into model behavior and biases, which is crucial for model deployment and future robustness optimizations. -We show that training on \name, instead of ImageNet, significantly reduces all of these biases, completely removing the models' dependence on the background distribution. 
-We make our code for \schemename and the \name-dataset publicly available\footnote{\url{https://github.com/tobna/ForAug}} to facilitate further research. +Beyond training, \schemename becomes a diagnostic tool for analyzing model behavior and biases, when used during evaluation. +We utilize our control over the image distribution to measure a model's background robustness (by varying the choice of background), foreground focus (by leveraging our knowledge about the placement of the foreground object), center bias (by controlling position), and size bias (by controlling size). +These analyses provide valuable insights into model behavior and biases, which is crucial for model deployment and future robustness optimizations. +We show that training using \schemename significantly reduces all of these biases. +We make our code for \schemename and the output of \schemename's segmentation phase on ImageNet publicly available\footnote{Link will go here.} to facilitate further research. \subsection*{Contributions} \begin{itemize} - \item We propose \schemename, a novel data augmentation scheme, that recombines objects and backgrounds to train Transformers for image classification. - \item We show that training on \name, the ImageNet instantiation of \schemename, leads to 4.5 p.p. improved accuracy on ImageNet and 7.3 p.p. on downstream tasks. - \item We propose novel \schemename-based metrics to analyze and quantify fine-grained biases trained models: Background Robustness, Foreground Focus, Center Bias, and Size Bias. Training on \name, instead of ImageNet, significantly reduces these biases. + \item We propose \schemename, a novel data augmentation scheme, that recombines objects and backgrounds. \schemename allows us to move beyond the (possibly biased) image compositions in the dataset while preserving label integrity. + \item We show that training a standard ViT using \schemename leads to up to 4.5 p.p. improved accuracy on ImageNet-1k and 7.3 p.p. 
on downstream tasks. + \item We propose novel \schemename-based metrics to analyze and quantify fine-grained biases of trained models: Background Robustness, Foreground Focus, Center Bias, and Size Bias. We show that \schemename significantly reduces these biases by encoding invariances that benefit ViTs into the training data. \end{itemize} \ No newline at end of file diff --git a/sec/method.tex b/sec/method.tex index 9d9cda8..7515ce7 100644 --- a/sec/method.tex +++ b/sec/method.tex @@ -1,6 +1,13 @@ % !TeX root = ../main.tex -\section{RecombiNet (Method)} +%\begin{figure*}[ht!] +% \centering +% \includegraphics[width=.9\textwidth]{img/fig-2.pdf} +% \caption{Overview of \name. The data creation consists of two stages: (1, offline) Segmentation, where we segment the foreground objects from the background and fill in the background. (2, online) Recombination, where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.} +% \label{fig:method} +%\end{figure*} + +\section{\schemename (Method)} \label{sec:method} % \begin{itemize} @@ -19,21 +26,21 @@ % \item Dealing with other data augmentations/transformations % \end{itemize} -\begin{figure*} - \centering - \includegraphics[width=\textwidth]{img/fig-2.pdf} - \caption{Overview of \name. The data creation consists of two stages: (1, offline) Segmentation, where we segment the foreground objects from the background and fill in the background. (2, online) Recombination, where we combine the foreground objects with different backgrounds to create new samples.} - \label{fig:method} -\end{figure*} - % We propose a novel dataset, called \name, that improves image classification performance by explicitly separating and recombining foreground objects and plain backgrounds. % \name consists of two stages: Segmentation and recombination. Both are visualized in \Cref{fig:method}.
-We introduce \schemename, a data augmentation scheme designed to enhance Transformer training by explicitly separating and recombining foreground objects and backgrounds. -\schemename involves two stages: Segmentation and Recombination, both visualized in \Cref{fig:method}. +% We introduce \schemename, a data augmentation scheme designed to enhance Transformer training by explicitly separating and recombining foreground objects and backgrounds. +% \schemename enhances transformer training by explicitly encoding spatial invariances that these need to learn explicitly in the data. +% \schemename involves two stages: Segmentation and Recombination, both visualized in \Cref{fig:method}. +We introduce \schemename, a data augmentation scheme designed to enhance Transformer training by embedding spatial invariances---which Transformers would otherwise need to learn implicitly---directly into the training data. +% It operates by explicitly segmenting and recombining foreground objects and backgrounds. +\schemename comprises two distinct stages: Segmentation and Recombination. Both stages are illustrated in \Cref{fig:method}. -\subsubsection*{Segmentation} + +\subsection{Segmentation} +\label{sec:segmentation} The segmentation stage isolates the foreground objects and their corresponding backgrounds. -We then fill in the background in a visually plausible way~\cite{Sun2024} using a pretrained object-removal model. +% We then fill in the background in a visually plausible way~\cite{Sun2024} using a pretrained object-removal model. +We then fill the background using a pretrained object-removal model, producing visually plausible~\cite{Sun2024}, neutral scenes ready for recombination. This stage is computed once offline and the results are stored for the recombination stage. First, foreground objects are detected and segmented from their backgrounds using a prompt-based segmentation model to exploit the classification datasets labels.
@@ -43,32 +50,39 @@ The \code{} guides the segmentation model towards the correct o This can be the case with prompts like ``sorrel'' or ``guenon'', where the more general name ``horse'' or ``monkey'' is more helpful. We derive the \code{} from the WordNet hierarchy, using the immediate hypernym. -We iteratively extract up to $n$ foreground masks for each dataset-image, using different more and more general prompts based on the more general synsets of WordNet (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...). +% We iteratively extract up to $n$ foreground masks for each dataset-image, using different more and more general prompts based on the more general synsets of WordNet (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...). +We iteratively extract $n$ foreground masks for each dataset-image, creating prompts by going one hypernym up the WordNet-tree each step (e.g. ``a sorrel, a type of horse'', ``a horse, a type of equine'', ...). Masks that are very similar, with a pairwise IoU of at least $0.9$, are merged. The output is a set of masks delineating the foreground objects and the backgrounds. We select the best mask per image (according to \Cref{eq:filtering-score}) in a later filtering step, described below. -An inpainting model that is specifically optimized to remove objects from images, such as LaMa~\cite{Suvorov2021} or Attentive Eraser~\cite{Sun2024}, is used to inpaint the foreground regions in the backgrounds. -To ensure the quality of the foreground and background images (for each dataset-image), we select a foreground/background pair from the $\leq n$ variants we have extracted and infilled in the previous steps. 
-Using an ensemble of six ViT, ResNet, and Swin Transformer models pretrained on the original dataset, we select the foreground/background pair that maximizes foreground performance while minimizing the performance on the background and size of the foreground according to: +First, an inpainting model that is specifically optimized to remove objects from images, such as LaMa~\cite{Suvorov2021} or Attentive Eraser~\cite{Sun2024}, is used to inpaint the foreground regions in the backgrounds. +Then, to ensure the quality of the foregrounds and the neutral background images, we select a foreground/background pair (for each dataset-image) from the $\leq n$ variants we have extracted and infilled in the previous steps. +Using an ensemble $E$ of six ViT, ResNet, and Swin Transformer models pretrained on the original dataset, we select the foreground/background pair that maximizes foreground performance while minimizing the performance on the background and size of the foreground. +For each model $m \in E$, we predict the score of the ground truth class $c$ on the foreground $\mathrm{fg}$ and background $\mathrm{bg}$ and weigh these with the size $\operatorname{size}(\cdot)$ in number of pixels according to: +% $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels. \begin{align} \begin{split} \label{eq:filtering-score} \text{score}(\mathrm{fg}, \mathrm{bg}, c) &= \log \left( \frac{1}{\abs{E}} \sum_{m \in E} \P[m(\mathrm{fg}) = c] \right) \\ & + \log \left( 1 - \frac{1}{\abs E} \sum_{m \in E} \P[m(\mathrm{bg}) = c] \right) \\ & + \lambda \log \left( 1 - \abs{\frac{\operatorname{size}(\mathrm{fg})}{\operatorname{size}(\mathrm{bg})} - \eps} \right). 
\end{split} \end{align} -Here, $E$ is the ensemble of models and $m$ is a pretrained model, $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels. -We ran a hyperparameter search using a manually annotated subset of foreground/background variants to find the factors in \Cref{eq:filtering-score}: $\lambda = 2$ and $\eps = 0.1$. -The \textit{optimal foreground size} of $10\%$ of the full image balances the smallest possible foreground size that encompasses all the respective class information in the image with still conveying the foreground information after pasting it onto another background. -This filtering step ensures we segment all the relevant foreground objects. +% We use $E$ is the ensemble of models and $m$ is a pretrained model, $c$ is the correct foreground class, $\mathrm{fg}$, and $\mathrm{bg}$ are the foreground and background and $\operatorname{size}(\cdot)$ is the size in number of pixels. +We run a hyperparameter search using a manually annotated subset of foreground/background variants to find the factors in \Cref{eq:filtering-score}: $\lambda = 2$ and $\eps = 0.1$. +% The \textit{optimal foreground size} of $10\%$ of the full image balances the smallest possible foreground size that encompasses all the respective class information in the image with still conveying the foreground information after pasting it onto another background. +% This filtering step ensures we segment all the relevant foreground objects. -Finally, we filter out backgrounds that are more than $80\%$ infilled, as these tend to be overly synthetic, plain and don't carry much information (see \Cref{sec:high-infill-ratio}). -We ablate this choice in \Cref{sec:ablation}. +Finally, we filter out backgrounds that are largely infilled, as these tend to be overly synthetic and do not carry much information (see the supplementary material). 
+% We ablate this choice in \Cref{sec:ablation}. +% While the computational cost for the segmentation stage is significant, this is a one-time calculation whose results can be reused in subsequent experiments (see the supplementary material for details). +Although the segmentation stage introduces computational overhead, it is a one-time cost with results that can be reused across experiments (see the supplementary material for details). In summary, we factorize the dataset into a set of foreground objects with a transparent background and a set of diverse backgrounds per class. -The next step is to recombine them as data augmentation before applying common data augmentation operations during training. +The next step is to recombine these, before applying other common data augmentation operations during training. -\subsubsection*{Recombination} -The recombination stage, which is performed online, combines the foreground objects with different backgrounds to create new training samples. -For each object, we follow the pipeline of: Pick an appropriate background, resize it to a fitting size, place it in the background image, smooth the transition edge, and apply other data augmentations. +\subsection{Recombination} +\label{sec:recombination} +The recombination stage, performed online during training, combines the foreground objects with different backgrounds to create new training samples. +For each object, we follow the pipeline of: Pick an appropriate background, resize it to a fitting size, and place it in the background image. +Through this step, we expose the model to variations beyond the image compositions of the dataset. For each foreground object, we sample a background using one of the following strategies: (1) the original image background, (2) the set of backgrounds from the same class, or (3) the set of all possible backgrounds.
@@ -76,26 +90,24 @@ These sets are trading off the amount of information the model can learn from th In each epoch, each foreground object is seen exactly once, but a background may appear multiple times. The selected foreground is resized based on its relative size within its original image and the relative size of the original foreground in the selected background image. -The final size is randomly selected from a 30\% range around upper and lower limits ($s_u$ and $s_l$), based on the original sizes: -\begin{align} - s \sim \mathcal U \left[ (1 - 0.3) s_l, (1 + 0.3) s_u \right]. -\end{align} +The final size is randomly selected from a 30\% range around upper and lower limits ($s_u$ and $s_l$), based on the original sizes. +% \begin{align} +% s \sim \mathcal U \left[ (1 - 0.3) s_l, (1 + 0.3) s_u \right]. +% \end{align} To balance the size of the foreground and that of the background's original foreground, the upper and lower limits $s_u$ and $s_l$ are set to the mean or range of both sizes, depending on the foreground size strategy: \emph{mean} or \emph{range}. The resized foreground is then placed at a random position within the background image. -This position is sampled from a generalization of the Bates distribution~\cite{Bates1955} with parameter $\eta \in \N$, visualized in \Cref{fig:bates-pdf}. -We choose the bates distribution, as it presents an easy way to sample from a bounded domain with just one hyperparameter that controls the concentration of the distribution. -$\eta = 1$ corresponds to the uniform distribution; $\eta > 1$ concentrates the distribution around the center; and for $\eta < -1$, the distribution is concentrated at the borders. To more seamlessly integrate the foreground, we apply a Gaussian blur with ${\sigma \in [\frac{\sigma_{\text{max}}}{10}, \sigma_{\text{max}}]}$, inspired by the standard range for the Gaussian blur operation in \cite{Touvron2022}, to the foreground's alpha-mask. 
We can apply standard data augmentation techniques in two modes: Either we apply all augmentations to the recombined image, or we apply the cropping and resizing to the background only and then apply the other augmentations after recombination. % While for the second mode, the foreground object will always be fully visible, the first mode uses the data augmentations in the same way they would be used for the baseline dataset. -The second mode ensures the foreground object remains fully visible, while the first mode mirrors standard data augmentation practices. - +% The second mode ensures the foreground object remains fully visible, while the first mode mirrors standard data augmentation practices. +The first mode mirrors standard augmentation practice, whereas the second one ensures the foreground object remains fully visible. We experiment with a constant mixing ratio, or a linear or cosine annealing schedule that increases the number of images from the original dataset over time. The mixing ratio acts as a probability of selecting an image from the original dataset; -otherwise, an image with the same foreground is recombined using \schemename. -Thus, we still ensure each foreground is seen once per epoch. +otherwise, an image with the same foreground is recombined using \schemename, ensuring each object is seen once per epoch. +% Thus, we still ensure each foreground is seen once per epoch. +The recombination stage is designed to be parallelized on the CPU during training and thus does not impact training time (see the supplementary material for details). diff --git a/sec/related_work.tex b/sec/related_work.tex index 70895ae..0f494ef 100644 --- a/sec/related_work.tex +++ b/sec/related_work.tex @@ -6,23 +6,33 @@ \paragraph{Data Augmentation for Image Classification} Data augmentation is a crucial technique for improving the performance and generalization of image classification models. 
Traditional augmentation strategies rely on simple geometric or color-space transformations like cropping, flipping, rotation, blurring, color jittering, or random erasing \cite{Zhong2017} to increase the diversity of the training data without changing their semantic meaning. -With the advent of Transformers, new data augmentation operations like PatchDropout \cite{Liu2022d} have been proposed. +With the advent of Vision Transformers, new data augmentation operations like PatchDropout \cite{Liu2022d} have been proposed. Other transformations like Mixup \cite{Zhang2018a}, CutMix \cite{Yun2019}, or random cropping and patching \cite{Takahashi2018} combine multiple input images. -These simple transformations are usually bundled to form more complex augmentation policies like AutoAugment \cite{Cubuk2018} and RandAugment \cite{Cubuk2019}, which automatically search for optimal augmentation policies or 3-augment \cite{Touvron2022} which is optimized to train a ViT. -For a general overview of data augmentation techniques for image classification, we refer to \cite{Shorten2019, Xu2023d}. +These simple transformations are usually bundled to form more complex augmentation policies like AutoAugment \cite{Cubuk2018} and RandAugment \cite{Cubuk2019}, +% which automatically search for optimal augmentation policies +or 3-augment \cite{Touvron2022} which is optimized to train a ViT. +For a general overview of data augmentation techniques for image classification, we refer to \citet{Shorten2019, Xu2023d}. -We build upon these general augmentation techniques by introducing a novel approach to explicitly separate and recombine foregrounds and backgrounds for image classification. -Our approach is used in tandem with traditional data augmentation techniques to improve model performance and reduce biases. 
+We build upon these general augmentations by introducing a novel approach to explicitly separate objects and backgrounds for image classification, allowing us to -- unlike these basic transformations -- move beyond dataset image compositions. +Our approach is used in addition to strong traditional techniques to improve performance and reduce biases. \paragraph{Copy-Paste Augmentation} -The copy-paste augmentation \cite{Ghiasi2020}, which is used for object detection \cite{Shermaine2025,Ghiasi2020} and instance segmentation \cite{Werman2021,Ling2022}, involves copying segmented objects from one image and pasting them onto another. -While typically human-annotated segmentation masks are used to extract the foreground objects, other foregound sources have been explored, like 3D models \cite{Hinterstoisser2019} and pretrained object-detection models for use on objects on white background \cite{Dwibedi2017} or synthetic images \cite{Ge2023}. -DeePaste \cite{Werman2021} focuses on using inpainting for a more seamless integration of the pasted object. +The copy-paste augmentation \cite{Ghiasi2020}, which is mainly used for object detection \cite{Shermaine2025,Ghiasi2020} and instance segmentation \cite{Werman2021,Ling2022}, involves copying segmented objects from one image and pasting them onto another. +While typically human-annotated segmentation masks are used to extract the foreground objects, other foreground sources have been explored, like 3D models \cite{Hinterstoisser2019} and pretrained object-detection models for use on objects on white background \cite{Dwibedi2017} or synthetic images \cite{Ge2023}. +% DeePaste \cite{Werman2021} focuses on using inpainting for a more seamless integration of the pasted object. +\citet{Kang2022} apply copy-paste as an alternative to CutMix in image classification, but they do not shift the size or position of the foregrounds and use normal dataset images as backgrounds. -Unlike these methods, \name focuses on image classification. 
-While for detection and segmentation, objects are pasted onto another image (with a different foreground) or on available or rendered background images of the target scene, we extract foreground objects and fill in the resulting holes in the background in a semantically neutral way. -This way, we can recombine any foreground object with a large variety of neutral backgrounds from natural images, enabling a controlled and diverse manipulation of image composition. +% Unlike these methods, \schemename focuses on image classification. +% While these methods paste objects onto another image (with a different foreground) or on available or rendered background images of the target scene, we extract foreground objects and fill in the resulting holes in the background in a semantically neutral way. +Unlike prior copy-paste methods that overlay objects, \schemename extracts foregrounds and replaces their backgrounds with semantically neutral fills, thereby preserving label integrity while enabling controlled and diverse recombination. +% This way, we are preserving label integrity while also having diverse, neutral backgrounds available for recombination, enabling a controlled and diverse manipulation of image composition. +\begin{figure*}[ht!] + \centering + \includegraphics[width=.9\textwidth]{img/fig-2.pdf} + \caption{Overview of \schemename. The data creation consists of two stages: Segmentation (offline, \Cref{sec:segmentation}), where we segment the foreground objects from the background and fill in the background. Recombination (online, \Cref{sec:recombination}), where we combine the foreground objects with different backgrounds to create new samples. After recombination, we apply strong, commonly used augmentation policies.} + \label{fig:method} +\end{figure*} \paragraph{Model robustness evaluation} Evaluating model robustness to various image variations is critical for understanding and improving model generalization. 
@@ -30,7 +40,7 @@ Datasets like ImageNet-C \cite{Hendrycks2019} and ImageNet-P \cite{Hendrycks2019 ImageNet-E \cite{Li2023e} evaluates model robustness against a collection of distribution shifts. Other datasets, such as ImageNet-D \cite{Zhang2024f}, focus on varying background, texture, and material, but rely on synthetic data. Stylized ImageNet \cite{Geirhos2018} investigates the impact of texture changes. -ImageNet-9 \cite{Xiao2020} explores background variations using segmented images, but the backgrounds are often artificial. +ImageNet-9 \cite{Xiao2020} explores background variations using segmented images, but backgrounds are often artificial. -In contrast to these existing datasets, which are used only for evaluation, \name provides fine-grained control over foreground object placement, size, and background selection, enabling a precise and comprehensive analysis of specific model biases within the context of a large-scale, real-world image distribution. -As \name also provides controllable training set generation, it goes beyond simply measuring robustness to actively improving it through training. +In contrast to these existing datasets, which are used only for evaluation, \schemename provides fine-grained control over foreground object placement, size, and background selection, enabling a precise and comprehensive analysis of specific model biases within the context of a large-scale, real-world image distribution. +As \schemename also provides controllable training set generation, it goes beyond simply measuring robustness to actively improving it through training. 
diff --git a/sec/reproducability.tex b/sec/reproducability.tex new file mode 100644 index 0000000..853f516 --- /dev/null +++ b/sec/reproducability.tex @@ -0,0 +1,228 @@ +% !TeX root = ../main.tex + +\makeatletter +% \@ifundefined{isChecklistMainFile}{ +% % We are compiling a standalone document +% \newif\ifreproStandalone +% \reproStandalonetrue +% }{ + % We are being \input into the main paper + \newif\ifreproStandalone + \reproStandalonefalse +% } +\makeatother + +\ifreproStandalone +\documentclass[letterpaper]{article} +\usepackage[submission]{aaai2026} +\setlength{\pdfpagewidth}{8.5in} +\setlength{\pdfpageheight}{11in} +\usepackage{times} +\usepackage{helvet} +\usepackage{courier} +\usepackage{xcolor} +\frenchspacing + +\begin{document} +\fi +\setlength{\leftmargini}{20pt} +\makeatletter\def\@listi{\leftmargin\leftmargini \topsep .5em \parsep .5em \itemsep .5em} +\def\@listii{\leftmargin\leftmarginii \labelwidth\leftmarginii \advance\labelwidth-\labelsep \topsep .4em \parsep .4em \itemsep .4em} +\def\@listiii{\leftmargin\leftmarginiii \labelwidth\leftmarginiii \advance\labelwidth-\labelsep \topsep .4em \parsep .4em \itemsep .4em}\makeatother + +\setcounter{secnumdepth}{0} +\renewcommand\thesubsection{\arabic{subsection}} +\renewcommand\labelenumi{\thesubsection.\arabic{enumi}} + +\newcounter{checksubsection} +\newcounter{checkitem}[checksubsection] + +\newcommand{\checksubsection}[1]{% + \refstepcounter{checksubsection}% + \paragraph{\arabic{checksubsection}. #1}% + \setcounter{checkitem}{0}% +} + +\newcommand{\checkitem}{% + \refstepcounter{checkitem}% + \item[\arabic{checksubsection}.\arabic{checkitem}.]% +} +\newcommand{\question}[2]{\normalcolor\checkitem #1 #2 \color{blue}} +\newcommand{\ifyespoints}[1]{\makebox[0pt][l]{\hspace{-15pt}\normalcolor #1}} + +\section*{Reproducibility Checklist} + +\vspace{1em} +\hrule +\vspace{1em} + +\textbf{Instructions for Authors:} + +This document outlines key aspects for assessing reproducibility. 
Please provide your input by editing this \texttt{.tex} file directly. + +For each question (that applies), replace the ``Type your response here'' text with your answer. + +\vspace{1em} +\noindent +\textbf{Example:} If a question appears as +% +\begin{center} + \noindent + \begin{minipage}{.9\linewidth} + \ttfamily\raggedright + \string\question \{Proofs of all novel claims are included\} \{(yes/partial/no)\} \\ + Type your response here + \end{minipage} +\end{center} +you would change it to: +\begin{center} + \noindent + \begin{minipage}{.9\linewidth} + \ttfamily\raggedright + \string\question \{Proofs of all novel claims are included\} \{(yes/partial/no)\} \\ + yes + \end{minipage} +\end{center} +% +Please make sure to: +\begin{itemize}\setlength{\itemsep}{.1em} + \item Replace ONLY the ``Type your response here'' text and nothing else. + \item Use one of the options listed for that question (e.g., \textbf{yes}, \textbf{no}, \textbf{partial}, or \textbf{NA}). + \item \textbf{Not} modify any other part of the \texttt{\string\question} command or any other lines in this document.\\ +\end{itemize} + +You can \texttt{\string\input} this .tex file right before \texttt{\string\end\{document\}} of your main file or compile it as a stand-alone document. Check the instructions on your conference's website to see if you will be asked to provide this checklist with your paper or separately. 
+ +\vspace{1em} +\hrule +\vspace{1em} + +% The questions start here + +\checksubsection{General Paper Structure} +\begin{itemize} + + \question{Includes a conceptual outline and/or pseudocode description of AI methods introduced}{(yes/partial/no/NA)} + yes + + \question{Clearly delineates statements that are opinions, hypothesis, and speculation from objective facts and results}{(yes/no)} + yes + + \question{Provides well-marked pedagogical references for less-familiar readers to gain background necessary to replicate the paper}{(yes/no)} + yes + +\end{itemize} +\checksubsection{Theoretical Contributions} +\begin{itemize} + + \question{Does this paper make theoretical contributions?}{(yes/no)} + no + + \ifyespoints{\vspace{1.2em}If yes, please address the following points:} + \begin{itemize} + + \question{All assumptions and restrictions are stated clearly and formally}{(yes/partial/no)} + Type your response here + + \question{All novel claims are stated formally (e.g., in theorem statements)}{(yes/partial/no)} + Type your response here + + \question{Proofs of all novel claims are included}{(yes/partial/no)} + Type your response here + + \question{Proof sketches or intuitions are given for complex and/or novel results}{(yes/partial/no)} + Type your response here + + \question{Appropriate citations to theoretical tools used are given}{(yes/partial/no)} + Type your response here + + \question{All theoretical claims are demonstrated empirically to hold}{(yes/partial/no/NA)} + Type your response here + + \question{All experimental code used to eliminate or disprove claims is included}{(yes/no/NA)} + Type your response here + + \end{itemize} +\end{itemize} + +\checksubsection{Dataset Usage} +\begin{itemize} + + \question{Does this paper rely on one or more datasets?}{(yes/no)} + yes + + \ifyespoints{If yes, please address the following points:} + \begin{itemize} + + \question{A motivation is given for why the experiments are conducted on the selected 
datasets}{(yes/partial/no/NA)} + yes + + \question{All novel datasets introduced in this paper are included in a data appendix}{(yes/partial/no/NA)} + no + + \question{All novel datasets introduced in this paper will be made publicly available upon publication of the paper with a license that allows free usage for research purposes}{(yes/partial/no/NA)} + yes + + \question{All datasets drawn from the existing literature (potentially including authors' own previously published work) are accompanied by appropriate citations}{(yes/no/NA)} + yes + + \question{All datasets drawn from the existing literature (potentially including authors' own previously published work) are publicly available}{(yes/partial/no/NA)} + yes + + \question{All datasets that are not publicly available are described in detail, with explanation why publicly available alternatives are not scientifically satisficing}{(yes/partial/no/NA)} + NA + + \end{itemize} +\end{itemize} + +\checksubsection{Computational Experiments} +\begin{itemize} + + \question{Does this paper include computational experiments?}{(yes/no)} + yes + + \ifyespoints{If yes, please address the following points:} + \begin{itemize} + + \question{This paper states the number and range of values tried per (hyper-) parameter during development of the paper, along with the criterion used for selecting the final parameter setting}{(yes/partial/no/NA)} + yes + + \question{Any code required for pre-processing data is included in the appendix}{(yes/partial/no)} + yes + + \question{All source code required for conducting and analyzing the experiments is included in a code appendix}{(yes/partial/no)} + yes + + \question{All source code required for conducting and analyzing the experiments will be made publicly available upon publication of the paper with a license that allows free usage for research purposes}{(yes/partial/no)} + yes + + \question{All source code implementing new methods have comments detailing the implementation, with 
references to the paper where each step comes from}{(yes/partial/no)} + yes + + \question{If an algorithm depends on randomness, then the method used for setting seeds is described in a way sufficient to allow replication of results}{(yes/partial/no/NA)} + yes + + \question{This paper specifies the computing infrastructure used for running experiments (hardware and software), including GPU/CPU models; amount of memory; operating system; names and versions of relevant software libraries and frameworks}{(yes/partial/no)} + yes + + \question{This paper formally describes evaluation metrics used and explains the motivation for choosing these metrics}{(yes/partial/no)} + yes + + \question{This paper states the number of algorithm runs used to compute each reported result}{(yes/no)} + yes + + \question{Analysis of experiments goes beyond single-dimensional summaries of performance (e.g., average; median) to include measures of variation, confidence, or other distributional information}{(yes/no)} + yes + + \question{The significance of any improvement or decrease in performance is judged using appropriate statistical tests (e.g., Wilcoxon signed-rank)}{(yes/partial/no)} + no + + \question{This paper lists all final (hyper-)parameters used for each model/algorithm in the paper’s experiments}{(yes/partial/no/NA)} + yes + + + \end{itemize} +\end{itemize} +\ifreproStandalone +\end{document} +\fi \ No newline at end of file