ForAug/main.bib

@Misc{Liang2022,
  author         = {Liang, Paul Pu and Zadeh, Amir and Morency, Louis-Philippe},
  title          = {Foundations and Recent Trends in Multimodal Machine Learning: Principles, Challenges, and Open Questions},
  doi            = {10.48550/ARXIV.2209.03430},
  eprint         = {2209.03430},
  archiveprefix  = {arXiv},
  file           = {:Liang2022 - Foundations and Recent Trends in Multimodal Machine Learning_ Principles, Challenges, and Open Questions.pdf:PDF},
  joplin         = {joplin://x-callback-url/openNote?id=fe93b7e173f4478da42f09cbfdd379ea},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@Misc{Weston2014,
  author         = {Weston, Jason and Chopra, Sumit and Bordes, Antoine},
  title          = {Memory Networks},
  doi            = {10.48550/ARXIV.1410.3916},
  eprint         = {1410.3916},
  archiveprefix  = {arXiv},
  file           = {:Weston2014 - Memory Networks.pdf:PDF},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2014},
}

@Misc{Graves2014,
  author         = {Graves, Alex and Wayne, Greg and Danihelka, Ivo},
  title          = {Neural {Turing} Machines},
  doi            = {10.48550/ARXIV.1410.5401},
  eprint         = {1410.5401},
  archiveprefix  = {arXiv},
  file           = {:Graves2014 - Neural Turing Machines.pdf:PDF},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2014},
}

@Misc{LeCun2022,
  author         = {LeCun, Yann},
  title          = {A Path Towards Autonomous Machine Intelligence},
  howpublished   = {OpenReview},
  url            = {https://openreview.net/forum?id=BZ5a1r-kVsf},
  file           = {:LeCun2022 - A Path Towards Autonomous Machine Intelligence.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@Misc{Bianchi2020,
  author         = {Bianchi, Federico and Rossiello, Gaetano and Costabello, Luca and Palmonari, Matteo and Minervini, Pasquale},
  title          = {Knowledge Graph Embeddings and Explainable {AI}},
  doi            = {10.48550/arxiv.2004.14843},
  eprint         = {2004.14843},
  archiveprefix  = {arXiv},
  file           = {:Bianchi2020 - Knowledge Graph Embeddings and Explainable AI.pdf:PDF},
  primaryclass   = {cs.AI},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2020},
}

@Article{Hitzler2020,
  author         = {Hitzler, Pascal and Janowicz, Krzysztof and Lecue, Freddy},
  title          = {On the Role of Knowledge Graphs in Explainable {AI}},
  doi            = {10.3233/SW-190374},
  number         = {1},
  pages          = {41--51},
  volume         = {11},
  address        = {NLD},
  file           = {:Hitzler2020 - On the Role of Knowledge Graphs in Explainable AI.pdf:PDF},
  issue_date     = {2020},
  journal        = {Semantic Web},
  month          = jan,
  publisher      = {IOS Press},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2020},
}

@InProceedings{Jaderberg2015,
  author         = {Jaderberg, Max and Simonyan, Karen and Zisserman, Andrew and Kavukcuoglu, Koray},
  booktitle      = {Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 2},
  title          = {Spatial Transformer Networks},
  location       = {Montreal, Canada},
  pages          = {2017--2025},
  publisher      = {MIT Press},
  series         = {NIPS'15},
  address        = {Cambridge, MA, USA},
  file           = {:Jaderberg2015 - Spatial Transformer Networks.pdf:PDF},
  numpages       = {9},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2015},
}

@Misc{Chen2022,
  author         = {Chen, Zhenghua and Wu, Min and Chan, Alvin and Li, Xiaoli and Ong, Yew-Soon},
  title          = {A Survey on {AI} Sustainability: Emerging Trends on Learning Algorithms and Research Challenges},
  doi            = {10.48550/ARXIV.2205.03824},
  eprint         = {2205.03824},
  archiveprefix  = {arXiv},
  file           = {:Chen2022 - A Survey on AI Sustainability_ Emerging Trends on Learning Algorithms and Research Challenges.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Article{Wynsberghe2021,
  author         = {van Wynsberghe, Aimee},
  title          = {Sustainable {AI}: {AI} for sustainability and the sustainability of {AI}},
  doi            = {10.1007/s43681-021-00043-6},
  number         = {3},
  pages          = {213--218},
  volume         = {1},
  file           = {:Wynsberghe2021 - Sustainable AI_ AI for Sustainability and the Sustainability of AI.pdf:PDF},
  journal        = {AI and Ethics},
  month          = aug,
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2021},
}

@InProceedings{Marino2019,
  author         = {Kenneth Marino and Mohammad Rastegari and Ali Farhadi and Roozbeh Mottaghi},
  booktitle      = {Conference on Computer Vision and Pattern Recognition (CVPR)},
  title          = {{OK-VQA}: A Visual Question Answering Benchmark Requiring External Knowledge},
  file           = {:Marino2019 - OK VQA_ a Visual Question Answering Benchmark Requiring External Knowledge.pdf:PDF},
  groups         = {Datasets},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2019},
}

@Misc{Schwenk2022,
  author         = {Dustin Schwenk and Apoorv Khandelwal and Christopher Clark and Kenneth Marino and Roozbeh Mottaghi},
  title          = {{A-OKVQA}: A Benchmark for Visual Question Answering using World Knowledge},
  eprint         = {2206.01718},
  archiveprefix  = {arXiv},
  file           = {:Schwenk2022 - A OKVQA_ a Benchmark for Visual Question Answering Using World Knowledge.pdf:PDF},
  groups         = {Datasets},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@InProceedings{Gui2022,
  author         = {Gui, Liangke and Wang, Borui and Huang, Qiuyuan and Hauptmann, Alexander and Bisk, Yonatan and Gao, Jianfeng},
  title          = {{KAT}: A Knowledge Augmented Transformer for Vision-and-Language},
  booktitle      = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  pages          = {956--968},
  publisher      = {Association for Computational Linguistics},
  address        = {Seattle, United States},
  month          = jul,
  year           = {2022},
  doi            = {10.18653/v1/2022.naacl-main.70},
  file           = {:Gui2022 - KAT_ a Knowledge Augmented Transformer for Vision and Language.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Vaswani2017,
  author         = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {Attention is All you Need},
  editor         = {Guyon, I. and von Luxburg, U. and Bengio, S. and Wallach, H. and Fergus, R. and Vishwanathan, S. and Garnett, R.},
  publisher      = {Curran Associates, Inc.},
  volume         = {30},
  file           = {:Vaswani2017 - Attention Is All You Need.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2017},
}

@InProceedings{Antol2015,
  author         = {Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C. Lawrence and Parikh, Devi},
  title          = {{VQA}: {V}isual {Q}uestion {A}nswering},
  booktitle      = {International Conference on Computer Vision (ICCV)},
  year           = {2015},
  file           = {:Antol2015 - VQA_ Visual Question Answering.pdf:PDF},
  groups         = {Datasets},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Article{Raffel2020,
  author         = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
  title          = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  journal        = {Journal of Machine Learning Research},
  volume         = {21},
  number         = {140},
  pages          = {1--67},
  year           = {2020},
  file           = {:Raffel2020 - Exploring the Limits of Transfer Learning with a Unified Text to Text Transformer.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Izacard2021,
  author         = {Izacard, Gautier and Grave, Edouard},
  title          = {Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering},
  booktitle      = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume},
  pages          = {874--880},
  publisher      = {Association for Computational Linguistics},
  address        = {Online},
  month          = apr,
  year           = {2021},
  doi            = {10.18653/v1/2021.eacl-main.74},
  file           = {:Izacard2021 - Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Misc{Izacard2020,
  author         = {Izacard, Gautier and Grave, Edouard},
  title          = {Distilling Knowledge from Reader to Retriever for Question Answering},
  year           = {2020},
  eprint         = {2012.04584},
  archiveprefix  = {arXiv},
  primaryclass   = {cs.CL},
  doi            = {10.48550/arxiv.2012.04584},
  file           = {:Izacard2020 - Distilling Knowledge from Reader to Retriever for Question Answering.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Radford2021,
  author         = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
  booktitle      = {Proceedings of the 38th International Conference on Machine Learning},
  title          = {Learning Transferable Visual Models From Natural Language Supervision},
  editor         = {Meila, Marina and Zhang, Tong},
  pages          = {8748--8763},
  publisher      = {PMLR},
  series         = {Proceedings of Machine Learning Research},
  volume         = {139},
  file           = {:Radford2021 - Learning Transferable Visual Models from Natural Language Supervision.pdf:PDF},
  month          = jul,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Yang2022,
  author         = {Yang, Zhengyuan and Gan, Zhe and Wang, Jianfeng and Hu, Xiaowei and Lu, Yumao and Liu, Zicheng and Wang, Lijuan},
  booktitle      = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  title          = {An Empirical Study of {GPT-3} for Few-Shot Knowledge-Based {VQA}},
  file           = {:Yang2022 - An Empirical Study of GPT 3 for Few Shot Knowledge Based VQA.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Misc{Li2022,
  author         = {Li, Chenliang and Xu, Haiyang and Tian, Junfeng and Wang, Wei and Yan, Ming and Bi, Bin and Ye, Jiabo and Chen, Hehong and Xu, Guohai and Cao, Zheng and Zhang, Ji and Huang, Songfang and Huang, Fei and Zhou, Jingren and Si, Luo},
  title          = {{mPLUG}: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections},
  doi            = {10.48550/ARXIV.2205.12005},
  eprint         = {2205.12005},
  archiveprefix  = {arXiv},
  file           = {:Li2022 - MPLUG_ Effective and Efficient Vision Language Learning by Cross Modal Skip Connections.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@InProceedings{LeeThorp2022,
  author         = {James P Lee-Thorp and Joshua Ainslie and Ilya Eckstein and Santiago Onta{\~n}{\'o}n},
  booktitle      = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  title          = {{FNet}: Mixing Tokens with {Fourier} Transforms},
  file           = {:LeeThorp2022 - FNet_ Mixing Tokens with Fourier Transforms.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Misc{Touvron2021,
  author         = {Touvron, Hugo and Cord, Matthieu and El-Nouby, Alaaeldin and Bojanowski, Piotr and Joulin, Armand and Synnaeve, Gabriel and J{\'e}gou, Herv{\'e}},
  title          = {Augmenting Convolutional networks with attention-based aggregation},
  doi            = {10.48550/ARXIV.2112.13692},
  eprint         = {2112.13692},
  archiveprefix  = {arXiv},
  file           = {:Touvron2021 - Augmenting Convolutional Networks with Attention Based Aggregation.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Touvron2022,
  author         = {Touvron, Hugo and Cord, Matthieu and J{\'e}gou, Herv{\'e}},
  booktitle      = {Computer Vision -- ECCV 2022},
  title          = {{DeiT III}: Revenge of the {ViT}},
  editor         = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
  pages          = {516--533},
  publisher      = {Springer Nature Switzerland},
  address        = {Cham},
  file           = {:Touvron2022 - DeiT III_ Revenge of the ViT.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@InProceedings{Dosovitskiy2021,
  author         = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},
  title          = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
  booktitle      = {9th International Conference on Learning Representations, {ICLR} 2021, Virtual Event, Austria, May 3-7, 2021},
  publisher      = {OpenReview.net},
  year           = {2021},
  file           = {:Dosovitskiy2021 - An Image Is Worth 16x16 Words_ Transformers for Image Recognition at Scale.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Tolstikhin2021,
  author         = {Tolstikhin, Ilya and Houlsby, Neil and Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Unterthiner, Thomas and Yung, Jessica and Steiner, Andreas Peter and Keysers, Daniel and Uszkoreit, Jakob and Lucic, Mario and Dosovitskiy, Alexey},
  title          = {{MLP}-Mixer: An all-{MLP} Architecture for Vision},
  booktitle      = {Advances in Neural Information Processing Systems},
  editor         = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan},
  pages          = {24261--24272},
  year           = {2021},
  eprint         = {2105.01601},
  archiveprefix  = {arXiv},
  primaryclass   = {cs.CV},
  url            = {https://openreview.net/forum?id=EI2KOXKdnP},
  file           = {:Tolstikhin2021 - MLP Mixer_ an All MLP Architecture for Vision.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Huang2020,
  author         = {Huang, Xiao Shi and Perez, Felipe and Ba, Jimmy and Volkovs, Maksims},
  booktitle      = {Proceedings of the 37th International Conference on Machine Learning},
  title          = {Improving Transformer Optimization Through Better Initialization},
  editor         = {Daum{\'e}, III, Hal and Singh, Aarti},
  pages          = {4475--4483},
  publisher      = {PMLR},
  series         = {Proceedings of Machine Learning Research},
  volume         = {119},
  file           = {:Huang2020 - Improving Transformer Optimization through Better Initialization.pdf:PDF},
  month          = jul,
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2020},
}

@InProceedings{Liang2022a,
  author         = {Liang, Youwei and Ge, Chongjian and Tong, Zhan and Song, Yibing and Wang, Jue and Xie, Pengtao},
  title          = {Not All Patches are What You Need: Expediting Vision Transformers via Token Reorganizations},
  booktitle      = {International Conference on Learning Representations},
  year           = {2022},
  comment        = {EViT},
  file           = {:Liang2022a - Not All Patches Are What You Need_ Expediting Vision Transformers Via Token Reorganizations.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Bartoldson2023,
  author         = {Bartoldson, Brian R and Kailkhura, Bhavya and Blalock, Davis},
  title          = {Compute-efficient deep learning: Algorithmic trends and opportunities},
  journal        = {Journal of Machine Learning Research},
  volume         = {24},
  number         = {122},
  pages          = {1--77},
  year           = {2023},
  file           = {:Bartoldson2022 - Compute Efficient Deep Learning_ Algorithmic Trends and Opportunities.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Tay2022,
  author         = {Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler, Donald},
  title          = {Efficient Transformers: A Survey},
  doi            = {10.1145/3530811},
  address        = {New York, NY, USA},
  file           = {:Tay2022 - Efficient Transformers_ a Survey.pdf:PDF},
  journal        = {ACM Computing Surveys},
  month          = apr,
  publisher      = {Association for Computing Machinery},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Misc{Xu2022,
  author         = {Xu, Jiarui and De Mello, Shalini and Liu, Sifei and Byeon, Wonmin and Breuel, Thomas and Kautz, Jan and Wang, Xiaolong},
  title          = {{GroupViT}: Semantic Segmentation Emerges from Text Supervision},
  doi            = {10.48550/arXiv.2202.11094},
  eprint         = {2202.11094},
  archiveprefix  = {arXiv},
  file           = {:Xu2022 - GroupViT_ Semantic Segmentation Emerges from Text Supervision.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Misc{Selva2022,
  author         = {Selva, Javier and Johansen, Anders S. and Escalera, Sergio and Nasrollahi, Kamal and Moeslund, Thomas B. and Clap{\'e}s, Albert},
  title          = {Video Transformers: A Survey},
  doi            = {10.48550/ARXIV.2201.05991},
  eprint         = {2201.05991},
  archiveprefix  = {arXiv},
  file           = {:Selva2022 - Video Transformers_ a Survey.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@InProceedings{Dehghani2022,
  author         = {Dehghani, Mostafa and Tay, Yi and Arnab, Anurag and Beyer, Lucas and Vaswani, Ashish},
  title          = {The Efficiency Misnomer},
  booktitle      = {International Conference on Learning Representations},
  year           = {2022},
  file           = {:Dehghani2022 - The Efficiency Misnomer.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Zhao2021,
  author         = {Zhao, Bo and Mopuri, Konda Reddy and Bilen, Hakan},
  title          = {Dataset Condensation with Gradient Matching},
  booktitle      = {International Conference on Learning Representations},
  year           = {2021},
  file           = {:Zhao2021 - Dataset Condensation with Gradient Matching.pdf:PDF},
  groups         = {Dataset Distillation Survey, Condensed Dataset},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Huang2016,
  author         = {Huang, Gao and Sun, Yu and Liu, Zhuang and Sedra, Daniel and Weinberger, Kilian Q.},
  title          = {Deep Networks with Stochastic Depth},
  booktitle      = {Computer Vision -- ECCV 2016},
  editor         = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max},
  pages          = {646--661},
  publisher      = {Springer International Publishing},
  address        = {Cham},
  year           = {2016},
  file           = {:Huang2016 - Deep Networks with Stochastic Depth.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Hu2018,
  author         = {Hu, Jie and Shen, Li and Sun, Gang},
  booktitle      = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  title          = {Squeeze-and-Excitation Networks},
  doi            = {10.1109/CVPR.2018.00745},
  pages          = {7132--7141},
  file           = {:Hu2018 - Squeeze and Excitation Networks.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2018},
}

@InProceedings{Rao2021,
  author         = {Rao, Yongming and Zhao, Wenliang and Zhu, Zheng and Lu, Jiwen and Zhou, Jie},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {Global Filter Networks for Image Classification},
  volume         = {34},
  file           = {:Rao2021 - Global Filter Networks for Image Classification.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Misc{Wang2020,
  author         = {Wang, Sinong and Li, Belinda Z. and Khabsa, Madian and Fang, Han and Ma, Hao},
  title          = {Linformer: Self-Attention with Linear Complexity},
  month          = jun,
  year           = {2020},
  eprint         = {2006.04768},
  archiveprefix  = {arXiv},
  primaryclass   = {cs.LG},
  doi            = {10.48550/arxiv.2006.04768},
  file           = {:Wang2020 - Linformer_ Self Attention with Linear Complexity.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Choromanski2021,
  author         = {Choromanski, Krzysztof Marcin and Likhosherstov, Valerii and Dohan, David and Song, Xingyou and Gane, Andreea and Sarlos, Tamas and Hawkins, Peter and Davis, Jared Quincy and Mohiuddin, Afroz and Kaiser, Lukasz and Belanger, David Benjamin and Colwell, Lucy J and Weller, Adrian},
  title          = {Rethinking Attention with Performers},
  booktitle      = {International Conference on Learning Representations},
  year           = {2021},
  file           = {:Choromanski2021 - Rethinking Attention with Performers.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Fournier2023,
  author         = {Fournier, Quentin and Caron, Ga{\'e}tan Marceau and Aloise, Daniel},
  title          = {A Practical Survey on Faster and Lighter Transformers},
  doi            = {10.1145/3586074},
  issn           = {0360-0300},
  address        = {New York, NY, USA},
  file           = {:Fournier2023 - A Practical Survey on Faster and Lighter Transformers.pdf:PDF},
  journal        = {ACM Computing Surveys},
  month          = mar,
  publisher      = {Association for Computing Machinery},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Misc{Marcus2020,
  author        = {Marcus, Gary},
  title         = {The Next Decade in {AI}: Four Steps Towards Robust Artificial Intelligence},
  doi           = {10.48550/arxiv.2002.06177},
  eprint        = {2002.06177},
  archiveprefix = {arXiv},
  file          = {:Marcus2020 - The Next Decade in AI_ Four Steps Towards Robust Artificial Intelligence.pdf:PDF},
  keywords      = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, I.2, I.2.6},
  month         = feb,
  primaryclass  = {cs.AI},
  readstatus    = {read},
  year          = {2020},
}

@InProceedings{Palacio2021,
  author     = {Palacio, Sebastian and Engler, Philipp and Hees, J{\"o}rn and Dengel, Andreas},
  booktitle  = {2020 25th International Conference on Pattern Recognition (ICPR)},
  title      = {Contextual Classification Using Self-Supervised Auxiliary Models for Deep Neural Networks},
  doi        = {10.1109/ICPR48806.2021.9412175},
  pages      = {8937--8944},
  file       = {:Palacio2021 - Contextual Classification Using Self Supervised Auxiliary Models for Deep Neural Networks.pdf:PDF},
  readstatus = {skimmed},
  year       = {2021},
}

@Article{Yang2021,
  author     = {Yi Yang and Yueting Zhuang and Yunhe Pan},
  title      = {Multiple knowledge representation for big data artificial intelligence: framework, applications, and case studies},
  doi        = {10.1631/fitee.2100463},
  number     = {12},
  pages      = {1551--1558},
  volume     = {22},
  file       = {:Yang2021 - Multiple Knowledge Representation for Big Data Artificial Intelligence_ Framework, Applications, and Case Studies.pdf:PDF},
  journal    = {Frontiers of Information Technology \& Electronic Engineering},
  month      = dec,
  publisher  = {Zhejiang University Press},
  readstatus = {skimmed},
  year       = {2021},
}

@InProceedings{Zhang2018,
  author    = {Zhao Zhang and Fuzhen Zhuang and Meng Qu and Fen Lin and Qing He},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  title     = {Knowledge Graph Embedding with Hierarchical Relation Structure},
  doi       = {10.18653/v1/d18-1358},
  publisher = {Association for Computational Linguistics},
  file      = {:Zhang2018 - Knowledge Graph Embedding with Hierarchical Relation Structure.pdf:PDF},
  year      = {2018},
}

@Article{Ding2018,
  author    = {Liya Ding},
  title     = {Human Knowledge in Constructing {AI} Systems {\textemdash} Neural Logic Networks Approach towards an Explainable {AI}},
  doi       = {10.1016/j.procs.2018.08.129},
  pages     = {1561--1570},
  volume    = {126},
  file      = {:Ding2018 - Human Knowledge in Constructing AI Systems _ Neural Logic Networks Approach Towards an Explainable AI.pdf:PDF},
  journal   = {Procedia Computer Science},
  publisher = {Elsevier {BV}},
  year      = {2018},
}

@Article{Jinfeng2020,
  author    = {Gao Jinfeng and Sehrish Qummar and Zhang Junming and Yao Ruxian and Fiaz Gul Khan},
  title     = {Ensemble Framework of Deep {CNNs} for Diabetic Retinopathy Detection},
  doi       = {10.1155/2020/8864698},
  editor    = {Elpida Keravnou},
  pages     = {1--11},
  volume    = {2020},
  file      = {:Jinfeng2020 - Ensemble Framework of Deep CNNs for Diabetic Retinopathy Detection.pdf:PDF},
  journal   = {Computational Intelligence and Neuroscience},
  month     = dec,
  publisher = {Hindawi Limited},
  year      = {2020},
}

@InProceedings{CasadoGarcia2020,
  author    = {Casado-Garc{\'\i}a, {\'A}ngela and Heras, J{\'o}nathan},
  title     = {Ensemble methods for object detection},
  booktitle = {ECAI 2020},
  pages     = {2688--2695},
  publisher = {IOS Press},
  year      = {2020},
  file      = {:CasadoGarcia2020 - Ensemble Methods for Object Detection.pdf:PDF},
}

@Misc{Pardo2019,
  author        = {Pardo, Alejandro and Alwassel, Humam and Heilbron, Fabian Caba and Thabet, Ali and Ghanem, Bernard},
  title         = {{RefineLoc}: Iterative Refinement for Weakly-Supervised Action Localization},
  doi           = {10.48550/ARXIV.1904.00227},
  eprint        = {1904.00227},
  archiveprefix = {arXiv},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  file          = {:Pardo2019 - RefineLoc_ Iterative Refinement for Weakly Supervised Action Localization.pdf:PDF},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher     = {arXiv},
  year          = {2019},
}

@InProceedings{Moon2019,
  author    = {Seungwhan Moon and Pararth Shah and Anuj Kumar and Rajen Subba},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  title     = {{OpenDialKG}: Explainable Conversational Reasoning with Attention-based Walks over Knowledge Graphs},
  doi       = {10.18653/v1/p19-1081},
  publisher = {Association for Computational Linguistics},
  file      = {:Moon2019 - OpenDialKG_ Explainable Conversational Reasoning with Attention Based Walks Over Knowledge Graphs.pdf:PDF},
  year      = {2019},
}

@InCollection{Azimi2019,
  author    = {Fatemeh Azimi and Federico Raue and J{\"o}rn Hees and Andreas Dengel},
  booktitle = {Artificial Neural Networks and Machine Learning {\textendash} {ICANN} 2019: Theoretical Neural Computation},
  title     = {A Reinforcement Learning Approach for Sequential Spatial Transformer Networks},
  doi       = {10.1007/978-3-030-30487-4_45},
  pages     = {585--597},
  publisher = {Springer International Publishing},
  file      = {:Azimi2019 - A Reinforcement Learning Approach for Sequential Spatial Transformer Networks.pdf:PDF},
  year      = {2019},
}

@Misc{Azimi2021,
  author        = {Azimi, Fatemeh and Nies, Jean-Francois Jacques Nicolas and Palacio, Sebastian and Raue, Federico and Hees, J{\"o}rn and Dengel, Andreas},
  title         = {Spatial Transformer Networks for Curriculum Learning},
  doi           = {10.48550/ARXIV.2108.09696},
  eprint        = {2108.09696},
  archiveprefix = {arXiv},
  copyright     = {Creative Commons Attribution 4.0 International},
  file          = {:Azimi2021 - Spatial Transformer Networks for Curriculum Learning.pdf:PDF},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher     = {arXiv},
  year          = {2021},
}

@Article{Vinuesa2020,
  author     = {Ricardo Vinuesa and Hossein Azizpour and Iolanda Leite and Madeline Balaam and Virginia Dignum and Sami Domisch and Anna Fell{\"a}nder and Simone Daniela Langhans and Max Tegmark and Francesco Fuso Nerini},
  title      = {The role of artificial intelligence in achieving the {Sustainable Development Goals}},
  doi        = {10.1038/s41467-019-14108-y},
  number     = {1},
  volume     = {11},
  file       = {:Vinuesa2020 - The Role of Artificial Intelligence in Achieving the Sustainable Development Goals.pdf:PDF},
  journal    = {Nature Communications},
  month      = jan,
  publisher  = {Springer Science and Business Media {LLC}},
  readstatus = {skimmed},
  year       = {2020},
}

@InProceedings{Kamath2022,
  author    = {Amita Kamath and Christopher Clark and Tanmay Gupta and Eric Kolve and Derek Hoiem and Aniruddha Kembhavi},
  booktitle = {Computer Vision -- ECCV 2022},
  title     = {Webly Supervised Concept Expansion for General Purpose Vision Models},
  doi       = {10.1007/978-3-031-20059-5_38},
  pages     = {662--681},
  publisher = {Springer Nature Switzerland},
  series    = {Lecture Notes in Computer Science},
  file      = {:Kamath2022 - Webly Supervised Concept Expansion For~General Purpose Vision Models.pdf:PDF},
  year      = {2022},
}

@Article{Xu2023c,
  author         = {Xu, Peng and Zhu, Xiatian and Clifton, David A.},
  title          = {Multimodal Learning With Transformers: A Survey},
  doi            = {10.1109/TPAMI.2023.3275156},
  number         = {10},
  pages          = {12113--12132},
  volume         = {45},
  file           = {:Xu2022a - Multimodal Learning with Transformers_ a Survey.pdf:PDF},
  journal        = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  keywords       = {Transformers;Task analysis;Surveys;Visualization;Taxonomy;Mathematical models;Data models;Multimodal learning;transformer;introductory;taxonomy;deep learning;machine learning},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2023},
}

@Misc{Sevim2022,
  author        = {Sevim, Nurullah and {\"O}zyedek, Ege Ozan and {\c{S}}ahinu{\c{c}}, Furkan and Ko{\c{c}}, Aykut},
  title         = {{Fast-FNet}: Accelerating Transformer Encoder Models via Efficient {Fourier} Layers},
  doi           = {10.48550/ARXIV.2209.12816},
  eprint        = {2209.12816},
  archiveprefix = {arXiv},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  file          = {:Sevim2022 - Fast FNet_ Accelerating Transformer Encoder Models Via Efficient Fourier Layers.pdf:PDF},
  keywords      = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), General Literature (cs.GL), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  publisher     = {arXiv},
  readstatus    = {skimmed},
  year          = {2022},
}

@InProceedings{Xu2020,
  author     = {Hongfei Xu and Qiuhui Liu and Josef van Genabith and Deyi Xiong and Jingyi Zhang},
  booktitle  = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  title      = {{Lipschitz} Constrained Parameter Initialization for Deep Transformers},
  doi        = {10.18653/v1/2020.acl-main.38},
  publisher  = {Association for Computational Linguistics},
  file       = {:Xu2020 - Lipschitz Constrained Parameter Initialization for Deep Transformers.pdf:PDF},
  readstatus = {skimmed},
  year       = {2020},
}

@InProceedings{Touvron2022a,
  author         = {Hugo Touvron and Matthieu Cord and Alaaeldin El-Nouby and Jakob Verbeek and Herv{\'{e}} J{\'{e}}gou},
  booktitle      = {Computer Vision -- {ECCV} 2022},
  date           = {2022},
  title          = {Three Things Everyone Should Know About Vision Transformers},
  doi            = {10.1007/978-3-031-20053-3_29},
  pages          = {497--515},
  publisher      = {Springer Nature Switzerland},
  series         = {Lecture Notes in Computer Science},
  file           = {:Touvron2022a - Three Things Everyone Should Know about Vision Transformers.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Misc{Zhang2022,
  author      = {Zhang, Daniel and Maslej, Nestor and Brynjolfsson, Erik and Etchemendy, John and Lyons, Terah and Manyika, James and Ngo, Helen and Niebles, Juan Carlos and Sellitto, Michael and Sakhaee, Ellie and Shoham, Yoav and Clark, Jack and Perrault, Raymond},
  date        = {2022},
  title       = {The {AI Index} 2022 Annual Report},
  doi         = {10.48550/ARXIV.2205.03468},
  eprint      = {2205.03468},
  eprintclass = {cs.AI},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Zhang2022 - The AI Index 2022 Annual Report.pdf:PDF},
  groups      = {Misc},
  keywords    = {Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2022},
}

@InProceedings{Jiang2021,
  author         = {Zihang Jiang and Qibin Hou and Li Yuan and Daquan Zhou and Yujun Shi and Xiaojie Jin and Anran Wang and Jiashi Feng},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {All Tokens Matter: Token Labeling for Training Better Vision Transformers},
  editor         = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan},
  url            = {https://openreview.net/forum?id=2vubO341F_E},
  file           = {:Jiang2021 - All Tokens Matter_ Token Labeling for Training Better Vision Transformers.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Rahate2022,
  author       = {Anil Rahate and Rahee Walambe and Sheela Ramanna and Ketan Kotecha},
  date         = {2022-05},
  journaltitle = {Information Fusion},
  title        = {Multimodal Co-learning: Challenges, applications with datasets, recent advances and future directions},
  doi          = {10.1016/j.inffus.2021.12.003},
  pages        = {203--239},
  volume       = {81},
  file         = {:Rahate2022 - Multimodal Co Learning_ Challenges, Applications with Datasets, Recent Advances and Future Directions.pdf:PDF},
  publisher    = {Elsevier {BV}},
  year         = {2022},
}

@InProceedings{Liu2022,
  author    = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie},
  booktitle = {2022 {IEEE}/{CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  date      = {2022-06},
  title     = {A {ConvNet} for the 2020s},
  doi       = {10.1109/cvpr52688.2022.01167},
  publisher = {{IEEE}},
  file      = {:Liu2022 - A ConvNet for the 2020s.pdf:PDF},
  keywords  = {ConvNext},
  year      = {2022},
}

@InProceedings{Rao2021a,
  author         = {Rao, Yongming and Zhao, Wenliang and Liu, Benlin and Lu, Jiwen and Zhou, Jie and Hsieh, Cho-Jui},
  booktitle      = {Advances in Neural Information Processing Systems (NeurIPS)},
  title          = {{DynamicViT}: Efficient Vision Transformers with Dynamic Token Sparsification},
  editor         = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P. S. Liang and J. Wortman Vaughan},
  pages          = {13937--13949},
  publisher      = {Curran Associates, Inc.},
  url            = {https://proceedings.neurips.cc/paper/2021/file/747d3443e319a22747fbb873e8b2f9f2-Paper.pdf},
  volume         = {34},
  file           = {:Rao2021a - DynamicViT_ Efficient Vision Transformers with Dynamic Token Sparsification.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Yun2019,
  author         = {Yun, Sangdoo and Han, Dongyoon and Chun, Sanghyuk and Oh, Seong Joon and Yoo, Youngjoon and Choe, Junsuk},
  title          = {{CutMix}: Regularization Strategy to Train Strong Classifiers With Localizable Features},
  booktitle      = {2019 {IEEE}/{CVF} International Conference on Computer Vision ({ICCV})},
  date           = {2019-10},
  year           = {2019},
  publisher      = {{IEEE}},
  doi            = {10.1109/iccv.2019.00612},
  file           = {:Yun2019 - CutMix_ Regularization Strategy to Train Strong Classifiers with Localizable Features.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Touvron2021a,
  author         = {Hugo Touvron and Matthieu Cord and Alexandre Sablayrolles and Gabriel Synnaeve and Herv{\'{e}} J{\'{e}}gou},
  booktitle      = {2021 {IEEE}/{CVF} International Conference on Computer Vision ({ICCV})},
  date           = {2021-10},
  title          = {Going deeper with Image Transformers},
  doi            = {10.1109/iccv48922.2021.00010},
  publisher      = {{IEEE}},
  file           = {:Touvron2021a - Going Deeper with Image Transformers.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Touvron2021b,
  author         = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and J{\'{e}}gou, Herv{\'{e}}},
  booktitle      = {Proceedings of the 38th International Conference on Machine Learning},
  title          = {Training data-efficient image transformers \& distillation through attention},
  editor         = {Meila, Marina and Zhang, Tong},
  pages          = {10347--10357},
  publisher      = {PMLR},
  series         = {Proceedings of Machine Learning Research},
  url            = {https://proceedings.mlr.press/v139/touvron21a.html},
  volume         = {139},
  file           = {:Touvron2021b - Training Data Efficient Image Transformers & Distillation through Attention.pdf:PDF},
  month          = jul,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Deng2009,
  author    = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  title     = {{ImageNet}: A large-scale hierarchical image database},
  booktitle = {2009 {IEEE} Conference on Computer Vision and Pattern Recognition},
  date      = {2009-06},
  year      = {2009},
  publisher = {{IEEE}},
  doi       = {10.1109/cvpr.2009.5206848},
  file      = {:Deng2009 - ImageNet_ a Large Scale Hierarchical Image Database.pdf:PDF},
}

@InProceedings{Ridnik2021,
  author         = {Tal Ridnik and Emanuel Ben-Baruch and Asaf Noy and Lihi Zelnik-Manor},
  booktitle      = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)},
  title          = {{ImageNet-21K} Pretraining for the Masses},
  eprint         = {2104.10972},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  url            = {https://openreview.net/forum?id=Zkj_VcZ6ol},
  archiveprefix  = {arXiv},
  file           = {:Ridnik2021 - ImageNet 21K Pretraining for the Masses.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Zhang2018a,
  author    = {Zhang, Hongyi and Cisse, Moustapha and Dauphin, Yann N. and Lopez-Paz, David},
  title     = {mixup: Beyond Empirical Risk Minimization},
  booktitle = {International Conference on Learning Representations},
  year      = {2018},
  url       = {https://openreview.net/forum?id=r1Ddp1-Rb},
  file      = {:Zhang2018a - Mixup_ beyond Empirical Risk Minimization.pdf:PDF},
}

@Article{Wang2018,
  author         = {Wang, Tongzhou and Zhu, Jun-Yan and Torralba, Antonio and Efros, Alexei A.},
  title          = {Dataset Distillation},
  journal        = {arXiv preprint arXiv:1811.10959},
  date           = {2018-11-27},
  year           = {2018},
  doi            = {10.48550/arxiv.1811.10959},
  eprint         = {1811.10959},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  file           = {:Wang2018 - Dataset Distillation.pdf:PDF},
  groups         = {Dataset Distillation Survey, Condensed Dataset},
  keywords       = {cs.LG, stat.ML},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Article{Bohdal2020,
  author         = {Ondrej Bohdal and Yongxin Yang and Timothy Hospedales},
  date           = {2020-06-15},
  title          = {Flexible Dataset Distillation: Learn Labels Instead of Images},
  doi            = {10.48550/arxiv.2006.08572},
  eprint         = {2006.08572},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  comment        = {Presented at NeurIPS 2020},
  file           = {:Bohdal2020 - Flexible Dataset Distillation_ Learn Labels Instead of Images.pdf:PDF},
  groups         = {Dataset Distillation Survey, Condensed Dataset},
  keywords       = {cs.LG, stat.ML},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2020},
}

@InProceedings{Nguyen2021a,
  author         = {Nguyen, Timothy and Novak, Roman and Xiao, Lechao and Lee, Jaehoon},
  title          = {Dataset Distillation with Infinitely Wide Convolutional Networks},
  booktitle      = {Advances in Neural Information Processing Systems},
  year           = {2021},
  editor         = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan},
  url            = {https://openreview.net/forum?id=hXWPpJedrVP},
  file           = {:Nguyen2021a - Dataset Distillation with Infinitely Wide Convolutional Networks.pdf:PDF},
  groups         = {Dataset Distillation Survey, Condensed Dataset},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@InProceedings{Zhao2021a,
  author         = {Bo Zhao and Hakan Bilen},
  booktitle      = {Proceedings of the 38th International Conference on Machine Learning},
  date           = {2021-02-16},
  title          = {Dataset Condensation with Differentiable {Siamese} Augmentation},
  doi            = {10.48550/arxiv.2102.08259},
  eprint         = {2102.08259},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  publisher      = {PMLR},
  series         = {Proceedings of Machine Learning Research},
  volume         = {139},
  file           = {:Zhao2021a - Dataset Condensation with Differentiable Siamese Augmentation.pdf:PDF},
  groups         = {Dataset Distillation Survey, Condensed Dataset},
  keywords       = {cs.LG, cs.CV},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2021},
}

@InProceedings{Sucholutsky2021,
  author       = {Ilia Sucholutsky and Matthias Schonlau},
  booktitle    = {2021 International Joint Conference on Neural Networks (IJCNN)},
  date         = {2021},
  title        = {Soft-Label Dataset Distillation and Text Dataset Distillation},
  doi          = {10.1109/IJCNN52387.2021.9533769},
  eprint       = {1910.02551},
  eprintclass  = {cs.LG},
  eprinttype   = {arXiv},
  organization = {IEEE},
  pages        = {1--8},
  file         = {:Sucholutsky2021 - Soft Label Dataset Distillation and Text Dataset Distillation.pdf:PDF},
  groups       = {Dataset Distillation Survey, Condensed Dataset},
  keywords     = {cs.LG, cs.AI, stat.ML},
  year         = {2021},
}

@Article{Asano2021,
  author      = {Yuki M. Asano and Aaqib Saeed},
  date        = {2021-12-01},
  title       = {Extrapolating from a Single Image to a Thousand Classes using Distillation},
  doi         = {10.48550/arxiv.2112.00725},
  eprint      = {2112.00725},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  file        = {:Asano2021 - Extrapolating from a Single Image to a Thousand Classes Using Distillation.pdf:PDF},
  groups      = {Dataset Distillation Survey, Condensed Dataset},
  keywords    = {cs.CV},
  readstatus  = {skimmed},
  year        = {2021},
}

@InProceedings{Cazenavette2022,
  author      = {George Cazenavette and Tongzhou Wang and Antonio Torralba and Alexei A. Efros and Jun-Yan Zhu},
  date        = {2022-03-22},
  title       = {Dataset Distillation by Matching Training Trajectories},
  doi         = {10.48550/arxiv.2203.11932},
  eprint      = {2203.11932},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  booktitle   = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  file        = {:Cazenavette2022 - Dataset Distillation by Matching Training Trajectories.pdf:PDF},
  groups      = {Dataset Distillation Survey, Condensed Dataset},
  keywords    = {cs.CV, cs.AI, cs.LG},
  readstatus  = {skimmed},
  year        = {2022},
}

@InProceedings{Cazenavette2022a,
  author     = {George Cazenavette and Tongzhou Wang and Antonio Torralba and Alexei A. Efros and Jun-Yan Zhu},
  booktitle  = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)},
  title      = {Wearable {ImageNet}: Synthesizing Tileable Textures via Dataset Distillation},
  doi        = {10.1109/CVPRW56347.2022.00252},
  pages      = {2277--2281},
  publisher  = {IEEE Computer Society},
  address    = {Los Alamitos, CA, USA},
  file       = {:Cazenavette2022a - Wearable ImageNet_ Synthesizing Tileable Textures Via Dataset Distillation.pdf:PDF},
  groups     = {Dataset Distillation Survey, Condensed Dataset},
  month      = jun,
  readstatus = {skimmed},
  year       = {2022},
}

@InProceedings{Wang2022,
  author    = {Wang, Kai and Zhao, Bo and Peng, Xiangyu and Zhu, Zheng and Yang, Shuo and Wang, Shuo and Huang, Guan and Bilen, Hakan and Wang, Xinchao and You, Yang},
  booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title     = {{CAFE}: Learning to Condense Dataset by Aligning Features},
  doi       = {10.1109/CVPR52688.2022.01188},
  pages     = {12186--12195},
  file      = {:Wang2022 - CAFE_ Learning to Condense Dataset by Aligning Features.pdf:PDF},
  groups    = {Dataset Distillation Survey, Condensed Dataset},
  year      = {2022},
}

@Article{Lee2022,
  author         = {Hae Beom Lee and Dong Bok Lee and Sung Ju Hwang},
  date           = {2022-08-21},
  title          = {Dataset Condensation with Latent Space Knowledge Factorization and Sharing},
  doi            = {10.48550/arxiv.2208.10494},
  eprint         = {2208.10494},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  file           = {:Lee2022 - Dataset Condensation with Latent Space Knowledge Factorization and Sharing.pdf:PDF},
  groups         = {Dataset Distillation Survey, Condensed Dataset},
  keywords       = {cs.LG, cs.AI},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@InProceedings{Zhou2022,
  author    = {Zhou, Yongchao and Nezhadarya, Ehsan and Ba, Jimmy},
  title     = {Dataset Distillation using Neural Feature Regression},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2022},
  editor    = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho},
  url       = {https://openreview.net/forum?id=2clwrA2tfik},
  file      = {:Zhou2022 - Dataset Distillation Using Neural Feature Regression.pdf:PDF},
  groups    = {Dataset Distillation Survey, Condensed Dataset},
}

@InProceedings{Liu2022a,
  author    = {Liu, Songhua and Wang, Kai and Yang, Xingyi and Ye, Jingwen and Wang, Xinchao},
  title     = {Dataset Distillation via Factorization},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2022},
  editor    = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho},
  url       = {https://openreview.net/forum?id=luGXvawYWJ},
  file      = {:Liu2022a - Dataset Distillation Via Factorization.pdf:PDF},
  groups    = {Dataset Distillation Survey, Condensed Dataset},
}

@InProceedings{Kim2022,
  author    = {Kim, Jang-Hyun and Kim, Jinuk and Oh, Seong Joon and Yun, Sangdoo and Song, Hwanjun and Jeong, Joonhyun and Ha, Jung-Woo and Song, Hyun Oh},
  booktitle = {Proceedings of the 39th International Conference on Machine Learning},
  title     = {Dataset Condensation via Efficient Synthetic-Data Parameterization},
  editor    = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
  pages     = {11102--11118},
  publisher = {PMLR},
  series    = {Proceedings of Machine Learning Research},
  url       = {https://proceedings.mlr.press/v162/kim22c.html},
  volume    = {162},
  file      = {:Kim2022 - Dataset Condensation Via Efficient Synthetic Data Parameterization.pdf:PDF},
  groups    = {Dataset Distillation Survey, Condensed Dataset},
  month     = jul,
  pdf       = {https://proceedings.mlr.press/v162/kim22c/kim22c.pdf},
  year      = {2022},
}

@InProceedings{Lee2022a,
  author      = {Saehyung Lee and Sanghyuk Chun and Sangwon Jung and Sangdoo Yun and Sungroh Yoon},
  booktitle   = {Proceedings of the 39th International Conference on Machine Learning},
  date        = {2022-02-07},
  title       = {Dataset Condensation with Contrastive Signals},
  doi         = {10.48550/arxiv.2202.02916},
  eprint      = {2202.02916},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  file        = {:http\://arxiv.org/pdf/2202.02916v3:PDF},
  groups      = {Dataset Distillation Survey, Condensed Dataset},
  keywords    = {cs.CV, cs.LG},
  year        = {2022},
}

@InProceedings{Thilak2022,
  author         = {Vimal Thilak and Etai Littwin and Shuangfei Zhai and Omid Saremi and Roni Paiss and Joshua M. Susskind},
  booktitle      = {Has it Trained Yet? NeurIPS 2022 Workshop},
  date           = {2022-06-10},
  title          = {The Slingshot Mechanism: An Empirical Study of Adaptive Optimizers and the Grokking Phenomenon},
  doi            = {10.48550/arxiv.2206.04817},
  eprint         = {2206.04817},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  url            = {https://openreview.net/forum?id=lY1e0PNkSJ},
  file           = {:Thilak2022 - The Slingshot Mechanism_ an Empirical Study of Adaptive Optimizers and the Grokking Phenomenon.pdf:PDF},
  keywords       = {cs.LG, math.OC},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@InProceedings{Katharopoulos2018,
  author         = {Katharopoulos, Angelos and Fleuret, Fran{\c{c}}ois},
  booktitle      = {Proceedings of the 35th International Conference on Machine Learning},
  title          = {Not all samples are created equal: Deep learning with importance sampling},
  pages          = {2525--2534},
  publisher      = {PMLR},
  series         = {Proceedings of Machine Learning Research},
  volume         = {80},
  file           = {:Katharopoulos2018 - Not All Samples Are Created Equal_ Deep Learning with Importance Sampling.pdf:PDF},
  groups         = {Dataset Distillation Survey, Importance Sampling},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2018},
}

@Article{Vodrahalli2018,
  author         = {Kailas Vodrahalli and Ke Li and Jitendra Malik},
  date           = {2018-11-30},
  title          = {Are All Training Examples Created Equal? An Empirical Study},
  doi            = {10.48550/arxiv.1811.12569},
  eprint         = {1811.12569},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  file           = {:Vodrahalli2018 - Are All Training Examples Created Equal_ an Empirical Study.pdf:PDF},
  groups         = {Importance Sampling, Dataset Distillation Survey},
  keywords       = {cs.LG, cs.CV, stat.ML},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2018},
}

@Article{Lapedriza2013,
  author         = {Agata Lapedriza and Hamed Pirsiavash and Zoya Bylinskii and Antonio Torralba},
  date           = {2013-11-25},
  title          = {Are all training examples equally valuable?},
  doi            = {10.48550/arxiv.1311.6510},
  eprint         = {1311.6510},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  file           = {:Lapedriza2013 - Are All Training Examples Equally Valuable_.pdf:PDF},
  groups         = {Importance Sampling, Dataset Distillation Survey},
  keywords       = {cs.CV, cs.LG, stat.ML},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2013},
}

@InProceedings{Coleman2020,
  author         = {Coleman, Cody and Yeh, Christopher and Mussmann, Stephen and Mirzasoleiman, Baharan and Bailis, Peter and Liang, Percy and Leskovec, Jure and Zaharia, Matei},
  title          = {Selection via Proxy: Efficient Data Selection for Deep Learning},
  booktitle      = {International Conference on Learning Representations},
  year           = {2020},
  url            = {https://openreview.net/forum?id=HJg2b0VYDr},
  file           = {:Coleman2020 - Selection Via Proxy_ Efficient Data Selection for Deep Learning.pdf:PDF},
  groups         = {Dataset Distillation Survey},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Article{Shleifer2019,
  author         = {Sam Shleifer and Eric Prokop},
  date           = {2019-06-12},
  title          = {Proxy Datasets for Training Convolutional Neural Networks},
  doi            = {10.48550/arxiv.1906.04887},
  eprint         = {1906.04887},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  file           = {:Shleifer2019 - Proxy Datasets for Training Convolutional Neural Networks.pdf:PDF},
  groups         = {Importance Sampling, Dataset Distillation Survey},
  keywords       = {cs.LG, cs.CV, stat.ML},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2019},
}

@Article{Zhangs2019,
  author         = {Cheng Zhang and Cengiz Öztireli and Stephan Mandt and Giampiero Salvi},
  date           = {2019-07},
  journaltitle   = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  title          = {Active Mini-Batch Sampling Using Repulsive Point Processes},
  doi            = {10.1609/aaai.v33i01.33015741},
  number         = {01},
  pages          = {5741--5748},
  volume         = {33},
  file           = {:Zhangs2019 - Active Mini Batch Sampling Using Repulsive Point Processes.pdf:PDF},
  groups         = {Dataset Distillation Survey, Importance Sampling},
  publisher      = {Association for the Advancement of Artificial Intelligence ({AAAI})},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2019},
}

@Article{Loshchilov2015,
  author         = {Ilya Loshchilov and Frank Hutter},
  date           = {2015-11-19},
  title          = {Online Batch Selection for Faster Training of Neural Networks},
  doi            = {10.48550/arxiv.1511.06343},
  eprint         = {1511.06343},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  note           = {Workshop Track - ICLR 2016},
  file           = {:Loshchilov2015 - Online Batch Selection for Faster Training of Neural Networks.pdf:PDF},
  groups         = {Dataset Distillation Survey, Importance Sampling},
  keywords       = {cs.LG, cs.NE, math.OC},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2015},
}

@InProceedings{Fan2018,
  author         = {Fan, Yang and Tian, Fei and Qin, Tao and Li, Xiang-Yang and Liu, Tie-Yan},
  title          = {Learning to Teach},
  booktitle      = {International Conference on Learning Representations},
  year           = {2018},
  url            = {https://openreview.net/forum?id=HJewuJWCZ},
  file           = {:Fan2018 - Learning to Teach.pdf:PDF},
  groups         = {Dataset Distillation Survey, Importance Sampling},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@InProceedings{Shu2019,
  author         = {Jun Shu and Qi Xie and Lixuan Yi and Qian Zhao and Sanping Zhou and Zongben Xu and Deyu Meng},
  booktitle      = {Advances in Neural Information Processing Systems},
  date           = {2019-02-20},
  title          = {{Meta-Weight-Net}: Learning an Explicit Mapping For Sample Weighting},
  doi            = {10.48550/arxiv.1902.07379},
  editor         = {H. Wallach and H. Larochelle and A. Beygelzimer and d'Alch{\'e}-Buc, F. and E. Fox and R. Garnett},
  eprint         = {1902.07379},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  publisher      = {Curran Associates, Inc.},
  url            = {https://proceedings.neurips.cc/paper/2019/file/e58cc5ca94270acaceed13bc82dfedf7-Paper.pdf},
  volume         = {32},
  file           = {:Shu2019 - Meta Weight Net_ Learning an Explicit Mapping for Sample Weighting.pdf:PDF},
  groups         = {Dataset Distillation Survey, Importance Sampling},
  keywords       = {cs.LG, stat.ML},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2019},
}

@InProceedings{Ren2018,
  author         = {Ren, Mengye and Zeng, Wenyuan and Yang, Bin and Urtasun, Raquel},
  booktitle      = {Proceedings of the 35th International Conference on Machine Learning},
  title          = {Learning to Reweight Examples for Robust Deep Learning},
  editor         = {Dy, Jennifer and Krause, Andreas},
  pages          = {4334--4343},
  publisher      = {PMLR},
  series         = {Proceedings of Machine Learning Research},
  url            = {https://proceedings.mlr.press/v80/ren18a.html},
  volume         = {80},
  file           = {:Ren2018 - Learning to Reweight Examples for Robust Deep Learning.pdf:PDF},
  groups         = {Dataset Distillation Survey, Importance Sampling},
  month          = jul,
  pdf            = {http://proceedings.mlr.press/v80/ren18a/ren18a.pdf},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2018},
}

@Article{Elman1993,
  author         = {Jeffrey L. Elman},
  date           = {1993-07},
  journaltitle   = {Cognition},
  title          = {Learning and development in neural networks: The importance of starting small},
  doi            = {10.1016/0010-0277(93)90058-4},
  number         = {1},
  pages          = {71--99},
  volume         = {48},
  file           = {:Elman1993 - Learning and Development in Neural Networks_ the Importance of Starting Small.pdf:PDF},
  groups         = {Importance Sampling, Dataset Distillation Survey},
  publisher      = {Elsevier {BV}},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {1993},
}

@InProceedings{Bengio2009,
  author         = {Bengio, Yoshua and Louradour, J{\'e}r{\^o}me and Collobert, Ronan and Weston, Jason},
  booktitle      = {Proceedings of the 26th Annual International Conference on Machine Learning},
  title          = {Curriculum Learning},
  doi            = {10.1145/1553374.1553380},
  isbn           = {9781605585161},
  venue          = {Montreal, Quebec, Canada},
  pages          = {41--48},
  publisher      = {Association for Computing Machinery},
  series         = {ICML '09},
  address        = {New York, NY, USA},
  file           = {:Bengio2009 - Curriculum Learning.pdf:PDF},
  groups         = {Importance Sampling, Dataset Distillation Survey},
  numpages       = {8},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2009},
}

@InProceedings{Jiang2017,
  author         = {Lu Jiang and Zhengyuan Zhou and Thomas Leung and Li-Jia Li and Li Fei-Fei},
  booktitle      = {Proceedings of the 35th International Conference on Machine Learning},
  date           = {2018},
  title          = {{MentorNet}: Learning Data-Driven Curriculum for Very Deep Neural Networks on Corrupted Labels},
  doi            = {10.48550/arxiv.1712.05055},
  eprint         = {1712.05055},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  venue          = {Stockholm, Sweden},
  file           = {:Jiang2017 - MentorNet_ Learning Data Driven Curriculum for Very Deep Neural Networks on Corrupted Labels.pdf:PDF},
  groups         = {Dataset Distillation Survey, Importance Sampling},
  keywords       = {cs.CV},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2018},
}

@InProceedings{Lee2021,
  author      = {Sangho Lee and Jiwan Chung and Youngjae Yu and Gunhee Kim and Thomas Breuel and Gal Chechik and Yale Song},
  booktitle   = {2021 {IEEE}/{CVF} International Conference on Computer Vision ({ICCV})},
  title       = {{ACAV100M}: Automatic Curation of Large-Scale Datasets for Audio-Visual Video Representation Learning},
  doi         = {10.48550/arxiv.2101.10803},
  eprint      = {2101.10803},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  file        = {:Lee2021 - ACAV100M_ Automatic Curation of Large Scale Datasets for Audio Visual Video Representation Learning.pdf:PDF},
  groups      = {Dataset Distillation Survey, Pruning},
  keywords    = {cs.CV},
  readstatus  = {skimmed},
  year        = {2021},
}

@InProceedings{Li2019,
  author         = {Li, Junnan and Wong, Yongkang and Zhao, Qi and Kankanhalli, Mohan S.},
  booktitle      = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  title          = {Learning to learn from noisy labeled data},
  pages          = {5051--5059},
  file           = {:Li2019 - Learning to Learn from Noisy Labeled Data.pdf:PDF},
  groups         = {Dataset Distillation Survey, Noisy Labels},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2019},
}

@Article{Wan2022,
  author      = {Zhijing Wan and Zhixiang Wang and CheukTing Chung and Zheng Wang},
  date        = {2022-10-21},
  title       = {A Survey of Data Optimization for Problems in Computer Vision Datasets},
  doi         = {10.48550/arxiv.2210.11717},
  eprint      = {2210.11717},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  file        = {:Wan2022 - A Survey of Data Optimization for Problems in Computer Vision Datasets.pdf:PDF},
  groups      = {Dataset Distillation Survey, Surveys},
  keywords    = {cs.CV, A.1},
  year        = {2022},
}

@InProceedings{Wortsman2022,
  author    = {Wortsman, Mitchell and Ilharco, Gabriel and Gadre, Samir Yitzhak and Roelofs, Rebecca and Gontijo-Lopes, Raphael and Morcos, Ari S. and Namkoong, Hongseok and Farhadi, Ali and Carmon, Yair and Kornblith, Simon and Schmidt, Ludwig},
  booktitle = {Proceedings of the 39th International Conference on Machine Learning},
  title     = {Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time},
  editor    = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
  pages     = {23965--23998},
  publisher = {PMLR},
  series    = {Proceedings of Machine Learning Research},
  url       = {https://proceedings.mlr.press/v162/wortsman22a.html},
  volume    = {162},
  file      = {:Wortsman2022 - Model Soups_ Averaging Weights of Multiple Fine Tuned Models Improves Accuracy without Increasing Inference Time.pdf:PDF},
  month     = jul,
  pdf       = {https://proceedings.mlr.press/v162/wortsman22a/wortsman22a.pdf},
  year      = {2022},
}

@Article{Dwivedi2022,
  author         = {Dwivedi, Vijay Prakash and Joshi, Chaitanya K. and Luu, Anh Tuan and Laurent, Thomas and Bengio, Yoshua and Bresson, Xavier},
  date           = {2022-12-28},
  journaltitle   = {Journal of Machine Learning Research},
  title          = {Benchmarking Graph Neural Networks},
  doi            = {10.48550/ARXIV.2003.00982},
  eprint         = {2003.00982},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Dwivedi2022 - Benchmarking Graph Neural Networks.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@InProceedings{Nguyen2021,
  author         = {Nguyen, Timothy and Chen, Zhourong and Lee, Jaehoon},
  title          = {Dataset Meta-Learning from Kernel Ridge-Regression},
  booktitle      = {International Conference on Learning Representations},
  year           = {2021},
  url            = {https://openreview.net/forum?id=l-PrrQrK0QR},
  file           = {:Nguyen2021 - Dataset Meta Learning from Kernel Ridge Regression.pdf:PDF},
  groups         = {Dataset Distillation Survey, Condensed Dataset},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Misc{Godbole2023,
  author         = {Godbole, Varun and Dahl, George E. and Gilmer, Justin and Shallue, Christopher J. and Nado, Zachary},
  title          = {Deep Learning Tuning Playbook},
  year           = {2023},
  note           = {Version 1.0},
  url            = {https://github.com/google-research/tuning_playbook},
  file           = {:Godbole2023 - Deep Learning Tuning Playbook.pdf:PDF},
  groups         = {Misc},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Liu2022b,
  author      = {Liu, Rosanne and Garrette, Dan and Saharia, Chitwan and Chan, William and Roberts, Adam and Narang, Sharan and Blok, Irina and Mical, RJ and Norouzi, Mohammad and Constant, Noah},
  title       = {Character-Aware Models Improve Visual Text Rendering},
  date        = {2022-12-20},
  year        = {2022},
  eprint      = {2212.10562},
  eprinttype  = {arXiv},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2212.10562},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Liu2022b - Character Aware Models Improve Visual Text Rendering.pdf:PDF},
  readstatus  = {skimmed},
}

@Article{Peebles2022,
  author         = {Peebles, William and Xie, Saining},
  title          = {Scalable Diffusion Models with Transformers},
  date           = {2022-12-19},
  year           = {2022},
  eprint         = {2212.09748},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2212.09748},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file           = {:Peebles2022 - Scalable Diffusion Models with Transformers.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Strubell2019,
  author         = {Emma Strubell and Ananya Ganesh and Andrew McCallum},
  booktitle      = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  date           = {2019},
  title          = {Energy and Policy Considerations for Deep Learning in {NLP}},
  doi            = {10.18653/v1/p19-1355},
  pages          = {3645--3650},
  publisher      = {Association for Computational Linguistics},
  url            = {https://aclanthology.org/P19-1355},
  comment        = {Same paper as entry Strubell2019a (identical DOI); keep both entries in sync or merge them.},
  file           = {:Strubell2019 - Energy and Policy Considerations for Deep Learning in NLP.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2019},
}

@InProceedings{Dao2022,
  author         = {Tri Dao and Daniel Y Fu and Stefano Ermon and Atri Rudra and Christopher Ré},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {{FlashAttention}: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
  editor         = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho},
  url            = {https://openreview.net/forum?id=H4DqfPSibmx},
  file           = {:Dao2022 - FlashAttention_ Fast and Memory Efficient Exact Attention with IO Awareness.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@InProceedings{Schuster2022,
  author         = {Schuster, Tal and Fisch, Adam and Gupta, Jai and Dehghani, Mostafa and Bahri, Dara and Tran, Vinh Q. and Tay, Yi and Metzler, Donald},
  title          = {Confident Adaptive Language Modeling},
  booktitle      = {Advances in Neural Information Processing Systems},
  editor         = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho},
  year           = {2022},
  url            = {https://openreview.net/forum?id=uLYc4L3C81A},
  file           = {:Schuster2022 - Confident Adaptive Language Modeling.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Schuurmans2023,
  author         = {Schuurmans, Dale},
  title          = {Memory Augmented Large Language Models are Computationally Universal},
  date           = {2023-01-10},
  year           = {2023},
  eprint         = {2301.04589},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CL},
  doi            = {10.48550/ARXIV.2301.04589},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computation and Language (cs.CL), Formal Languages and Automata Theory (cs.FL), FOS: Computer and information sciences},
  file           = {:Schuurmans2023 - Memory Augmented Large Language Models Are Computationally Universal.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Rahman2023,
  author         = {Rahman, Md Salman and Lee, Wonkwon},
  title          = {Out of Distribution Performance of State of Art Vision Model},
  date           = {2023-01-25},
  year           = {2023},
  eprint         = {2301.10750},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2301.10750},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file           = {:Rahman2023 - Out of Distribution Performance of State of Art Vision Model.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Article{Cohen2023,
  author      = {Cohen, Roi and Geva, Mor and Berant, Jonathan and Globerson, Amir},
  title       = {Crawling the Internal Knowledge-Base of Language Models},
  date        = {2023-01-30},
  year        = {2023},
  eprint      = {2301.12810},
  eprinttype  = {arXiv},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2301.12810},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file        = {:Cohen2023 - Crawling the Internal Knowledge Base of Language Models.pdf:PDF},
}

@Article{Liu2023,
  author      = {Liu, Bin and Wang, Bang},
  title       = {Bayesian Self-Supervised Contrastive Learning},
  date        = {2023-01-27},
  year        = {2023},
  eprint      = {2301.11673},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2301.11673},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Liu2023 - Bayesian Self Supervised Contrastive Learning.pdf:PDF},
}

@InProceedings{Zhao2021b,
  author      = {Zhao, Bo and Bilen, Hakan},
  booktitle   = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision 2023 (WACV)},
  date        = {2021-10-08},
  title       = {Dataset Condensation with Distribution Matching},
  doi         = {10.48550/ARXIV.2110.04181},
  eprint      = {2110.04181},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Zhao2021b - Dataset Condensation with Distribution Matching.pdf:PDF},
  groups      = {Dataset Distillation Survey, Condensed Dataset},
  keywords    = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2021},
}

@InProceedings{Jiang2022,
  author    = {Jiang, Chaoya and Xu, Haiyang and Li, Chenliang and Yan, Ming and Ye, Wei and Zhang, Shikun and Bi, Bin and Huang, Songfang},
  title     = {{TRIPS}: Efficient Vision-and-Language Pre-training with Text-Relevant Image Patch Selection},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  pages     = {4084--4096},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates},
  month     = dec,
  year      = {2022},
  url       = {https://aclanthology.org/2022.emnlp-main.273},
  file      = {:Jiang2022 - TRIPS_ Efficient Vision and Language Pre Training with Text Relevant Image Patch Selection.pdf:PDF},
}

@InProceedings{Zhuang2023,
  author         = {Zhuang, Bohan and Liu, Jing and Pan, Zizheng and He, Haoyu and Weng, Yuetian and Shen, Chunhua},
  booktitle      = {Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, {IJCAI-23}},
  title          = {A Survey on Efficient Training of Transformers},
  doi            = {10.24963/ijcai.2023/764},
  editor         = {Edith Elkind},
  eprint         = {2302.01107},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  note           = {Survey Track},
  pages          = {6823--6831},
  publisher      = {International Joint Conferences on Artificial Intelligence Organization},
  url            = {https://doi.org/10.24963/ijcai.2023/764},
  archiveprefix  = {arXiv},
  file           = {:Zhuang2023 - A Survey on Efficient Training of Transformers.pdf:PDF},
  month          = aug,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Liu2023a,
  author      = {Liu, Hao and Yan, Wilson and Abbeel, Pieter},
  title       = {Language Quantized AutoEncoders: Towards Unsupervised Text-Image Alignment},
  date        = {2023-02-02},
  year        = {2023},
  eprint      = {2302.00902},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2302.00902},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Liu2023a - Language Quantized AutoEncoders_ Towards Unsupervised Text Image Alignment.pdf:PDF},
}

@Article{Jiao2023,
  author  = {Jiao, Jiayu and Tang, Yu-Ming and Lin, Kun-Yu and Gao, Yipeng and Ma, Jinhua and Wang, Yaowei and Zheng, Wei-Shi},
  title   = {{DilateFormer}: Multi-Scale Dilated Transformer for Visual Recognition},
  file    = {:Jiao2023 - DilateFormer_ Multi Scale Dilated Transformer for Visual Recognition.pdf:PDF},
  journal = {{IEEE} Transactions on Multimedia},
  year    = {2023},
}

@Article{Han2022,
  author      = {Han, Xing and Ren, Tongzheng and Nguyen, Tan Minh and Nguyen, Khai and Ghosh, Joydeep and Ho, Nhat},
  title       = {Robustify Transformers with Robust Kernel Density Estimation},
  date        = {2022-10-11},
  year        = {2022},
  eprint      = {2210.05794},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2210.05794},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Han2022 - Robustify Transformers with Robust Kernel Density Estimation.pdf:PDF},
}

@InProceedings{Strubell2019a,
  author     = {Strubell, Emma and Ganesh, Ananya and McCallum, Andrew},
  booktitle  = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  date       = {2019-06-05},
  title      = {Energy and Policy Considerations for Deep Learning in {NLP}},
  doi        = {10.18653/v1/P19-1355},
  pages      = {3645--3650},
  publisher  = {Association for Computational Linguistics},
  url        = {https://aclanthology.org/P19-1355},
  address    = {Florence, Italy},
  comment    = {Same paper as entry Strubell2019 (identical DOI); keep both entries in sync or merge them.},
  file       = {:Strubell2019a - Energy and Policy Considerations for Deep Learning in NLP.pdf:PDF},
  keywords   = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  month      = jul,
  readstatus = {read},
  year       = {2019},
}

@InProceedings{Du2022,
  author    = {Du, Jiawei and Zhou, Daquan and Feng, Jiashi and Tan, Vincent and Zhou, Joey Tianyi},
  booktitle = {Advances in Neural Information Processing Systems},
  title     = {Sharpness-Aware Training for Free},
  editor    = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho},
  url       = {https://openreview.net/forum?id=xK6wRfL2mv7},
  file      = {:Du2022 - Sharpness Aware Training for Free.pdf:PDF},
  priority  = {prio2},
  year      = {2022},
}

@Article{Li2023,
  author         = {Li, Wenzhe and Luo, Hao and Lin, Zichuan and Zhang, Chongjie and Lu, Zongqing and Ye, Deheng},
  title          = {A Survey on Transformers in Reinforcement Learning},
  date           = {2023-01-08},
  year           = {2023},
  eprint         = {2301.03044},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  doi            = {10.48550/ARXIV.2301.03044},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file           = {:Li2023 - A Survey on Transformers in Reinforcement Learning.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Khan2022,
  author    = {Khan, Salman and Naseer, Muzammal and Hayat, Munawar and Zamir, Syed Waqas and Khan, Fahad Shahbaz and Shah, Mubarak},
  title     = {Transformers in Vision: A Survey},
  journal   = {{ACM} Computing Surveys},
  volume    = {54},
  number    = {10s},
  pages     = {1--41},
  publisher = {Association for Computing Machinery ({ACM})},
  date      = {2022-01},
  year      = {2022},
  doi       = {10.1145/3505244},
  file      = {:Khan2022 - Transformers in Vision_ a Survey.pdf:PDF},
}

@InProceedings{ElNouby2021,
  author         = {El-Nouby, Alaaeldin and Touvron, Hugo and Caron, Mathilde and Bojanowski, Piotr and Douze, Matthijs and Joulin, Armand and Laptev, Ivan and Neverova, Natalia and Synnaeve, Gabriel and Verbeek, Jakob and Jégou, Hervé},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {{XCiT}: Cross-Covariance Image Transformers},
  doi            = {10.48550/arxiv.2106.09681},
  editor         = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan},
  eprint         = {2106.09681},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:ElNouby2021 - XCiT_ Cross Covariance Image Transformers.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Liu2022c,
  author         = {Liu, Ze and Hu, Han and Lin, Yutong and Yao, Zhuliang and Xie, Zhenda and Wei, Yixuan and Ning, Jia and Cao, Yue and Zhang, Zheng and Dong, Li and Wei, Furu and Guo, Baining},
  booktitle      = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title          = {Swin Transformer {V2}: Scaling Up Capacity and Resolution},
  doi            = {10.48550/ARXIV.2111.09883},
  eprint         = {2111.09883},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  pages          = {11999--12009},
  publisher      = {IEEE},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Liu2022c - Swin Transformer V2_ Scaling up Capacity and Resolution.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Article{Pan2023,
  author      = {Pan, Zizheng and Cai, Jianfei and Zhuang, Bohan},
  title       = {Stitchable Neural Networks},
  date        = {2023-02-13},
  year        = {2023},
  eprint      = {2302.06586},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2302.06586},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Pan2023 - Stitchable Neural Networks.pdf:PDF},
}

@Article{Dehghani2023,
  author         = {Dehghani, Mostafa and Djolonga, Josip and Mustafa, Basil and Padlewski, Piotr and Heek, Jonathan and Gilmer, Justin and Steiner, Andreas and Caron, Mathilde and Geirhos, Robert and Alabdulmohsin, Ibrahim and Jenatton, Rodolphe and Beyer, Lucas and Tschannen, Michael and Arnab, Anurag and Wang, Xiao and Riquelme, Carlos and Minderer, Matthias and Puigcerver, Joan and Evci, Utku and Kumar, Manoj and van Steenkiste, Sjoerd and Elsayed, Gamaleldin F. and Mahendran, Aravindh and Yu, Fisher and Oliver, Avital and Huot, Fantine and Bastings, Jasmijn and Collier, Mark Patrick and Gritsenko, Alexey and Birodkar, Vighnesh and Vasconcelos, Cristina and Tay, Yi and Mensink, Thomas and Kolesnikov, Alexander and Pavetić, Filip and Tran, Dustin and Kipf, Thomas and Lučić, Mario and Zhai, Xiaohua and Keysers, Daniel and Harmsen, Jeremiah and Houlsby, Neil},
  title          = {Scaling Vision Transformers to 22 Billion Parameters},
  date           = {2023-02-10},
  year           = {2023},
  eprint         = {2302.05442},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2302.05442},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file           = {:Dehghani2023 - Scaling Vision Transformers to 22 Billion Parameters.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Article{Miyazawa2023,
  author    = {Kazuki Miyazawa and Takayuki Nagai},
  date      = {2023-02},
  title     = {Survey on Multimodal Transformers for Robots},
  doi       = {10.36227/techrxiv.21993317.v1},
  file      = {:Miyazawa2023 - Survey on Multimodal Transformers for Robots.pdf:PDF},
  publisher = {Institute of Electrical and Electronics Engineers ({IEEE})},
  year      = {2023},
}

@InProceedings{Peng2021,
  author      = {Peng, Zhiliang and Huang, Wei and Gu, Shanzhi and Xie, Lingxi and Wang, Yaowei and Jiao, Jianbin and Ye, Qixiang},
  booktitle   = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
  date        = {2021-05-09},
  title       = {Conformer: Local Features Coupling Global Representations for Visual Recognition},
  doi         = {10.1109/ICCV48922.2021.00042},
  eprint      = {2105.03889},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  pages       = {357--366},
  publisher   = {IEEE Computer Society},
  address     = {Los Alamitos, CA, USA},
  file        = {:Peng2021 - Conformer_ Local Features Coupling Global Representations for Visual Recognition.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  month       = oct,
  year        = {2021},
}

@Misc{Canziani2016,
  author         = {Canziani, Alfredo and Paszke, Adam and Culurciello, Eugenio},
  title          = {An Analysis of Deep Neural Network Models for Practical Applications},
  date           = {2016-05-24},
  year           = {2016},
  eprint         = {1605.07678},
  eprinttype     = {arXiv},
  archiveprefix  = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.1605.07678},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file           = {:Canziani2016 - An Analysis of Deep Neural Network Models for Practical Applications.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Liu2021,
  author         = {Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
  booktitle      = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},
  doi            = {10.1109/ICCV48922.2021.00986},
  eprint         = {2103.14030},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  pages          = {9992--10002},
  publisher      = {IEEE Computer Society},
  address        = {Los Alamitos, CA, USA},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Liu2021 - Swin Transformer_ Hierarchical Vision Transformer Using Shifted Windows.pdf:PDF},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Su2023,
  author    = {Su, Tong and Song, Chengqun and Cheng, Jun},
  title     = {Vision Transformer with Information Bottleneck for Fine-Grained Visual Classification},
  booktitle = {Advances in Guidance, Navigation and Control},
  editor    = {Yan, Liang and Duan, Haibin and Deng, Yimin},
  pages     = {4010--4019},
  publisher = {Springer Nature Singapore},
  address   = {Singapore},
  year      = {2023},
  isbn      = {978-981-19-6613-2},
  doi       = {10.1007/978-981-19-6613-2_391},
}

@Article{Shinoda2023,
  author       = {Shinoda, Risa and Kataoka, Hirokatsu and Hara, Kensho and Noguchi, Ryozo},
  title        = {Transformer-based ripeness segmentation for tomatoes},
  journaltitle = {Smart Agricultural Technology},
  volume       = {4},
  pages        = {100196},
  publisher    = {Elsevier {BV}},
  date         = {2023-08},
  year         = {2023},
  issn         = {2772-3755},
  doi          = {10.1016/j.atech.2023.100196},
  url          = {https://www.sciencedirect.com/science/article/pii/S2772375523000266},
}

@Article{Yao2023,
  author       = {Dazhi Yao and Yunxue Shao},
  date         = {2023-02},
  journaltitle = {Signal, Image and Video Processing},
  title        = {A hierarchical and data-efficient network based on patch-based representation},
  doi          = {10.1007/s11760-023-02488-0},
  file         = {:Yao2023 - A Hierarchical and Data Efficient Network Based on Patch Based Representation.pdf:PDF},
  publisher    = {Springer Science and Business Media {LLC}},
  year         = {2023},
}

@Article{Chen2023,
  author      = {Chen, Xiangning and Liang, Chen and Huang, Da and Real, Esteban and Wang, Kaiyuan and Liu, Yao and Pham, Hieu and Dong, Xuanyi and Luong, Thang and Hsieh, Cho-Jui and Lu, Yifeng and Le, Quoc V.},
  title       = {Symbolic Discovery of Optimization Algorithms},
  date        = {2023-02-13},
  year        = {2023},
  eprint      = {2302.06675},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2302.06675},
  url         = {https://github.com/google/automl/tree/master/lion},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
  file        = {:Chen2023 - Symbolic Discovery of Optimization Algorithms.pdf:PDF},
}

@Article{Raissi2023,
  author      = {Raissi, Maziar},
  title       = {Open Problems in Applied Deep Learning},
  date        = {2023-01-26},
  year        = {2023},
  eprint      = {2301.11316},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2301.11316},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Human-Computer Interaction (cs.HC), Information Retrieval (cs.IR), FOS: Computer and information sciences},
  file        = {:Raissi2023 - Open Problems in Applied Deep Learning.pdf:PDF},
}

@Misc{Patro2023,
  author         = {Patro, Badri N. and Agneeswaran, Vijay Srinivas},
  title          = {Efficiency 360: Efficient Vision Transformers},
  date           = {2023-02-16},
  year           = {2023},
  eprint         = {2302.08374},
  eprinttype     = {arXiv},
  archiveprefix  = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2302.08374},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file           = {:Patro2023 - Efficiency 360_ Efficient Vision Transformers.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Article{Nag2023,
  author      = {Nag, Shashank and Datta, Gourav and Kundu, Souvik and Chandrachoodan, Nitin and Beerel, Peter A.},
  date        = {2023-02-17},
  title       = {{ViTA}: A Vision Transformer Inference Accelerator for Edge Applications},
  doi         = {10.48550/ARXIV.2302.09108},
  eprint      = {2302.09108},
  eprintclass = {cs.AR},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Nag2023 - ViTA_ a Vision Transformer Inference Accelerator for Edge Applications.pdf:PDF},
  keywords    = {Hardware Architecture (cs.AR), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Caron2021,
  author         = {Caron, Mathilde and Touvron, Hugo and Misra, Ishan and Jégou, Hervé and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  date           = {2021-04-29},
  title          = {Emerging Properties in Self-Supervised Vision Transformers},
  doi            = {10.48550/ARXIV.2104.14294},
  eprint         = {2104.14294},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  publisher      = {IEEE},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Caron2021 - Emerging Properties in Self Supervised Vision Transformers.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2021},
}

@InProceedings{Graham2021,
  author      = {Graham, Ben and El-Nouby, Alaaeldin and Touvron, Hugo and Stock, Pierre and Joulin, Armand and Jégou, Hervé and Douze, Matthijs},
  booktitle   = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title       = {{LeViT}: a Vision Transformer in {ConvNet's} Clothing for Faster Inference},
  doi         = {10.48550/arxiv.2104.01136},
  eprint      = {2104.01136},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  pages       = {12259--12269},
  publisher   = {IEEE},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Graham2021 - LeViT_ a Vision Transformer in ConvNet's Clothing for Faster Inference.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  month       = oct,
  year        = {2021},
}

@InProceedings{Recht2019,
  author       = {Recht, Benjamin and Roelofs, Rebecca and Schmidt, Ludwig and Shankar, Vaishaal},
  booktitle    = {International Conference on Machine Learning},
  title        = {Do {ImageNet} Classifiers Generalize to {ImageNet}?},
  doi          = {10.48550/arxiv.1902.10811},
  eprint       = {1902.10811},
  eprintclass  = {cs.CV},
  eprinttype   = {arXiv},
  organization = {PMLR},
  pages        = {5389--5400},
  file         = {:Recht2019 - Do ImageNet Classifiers Generalize to ImageNet_.pdf:PDF},
  keywords     = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  readstatus   = {skimmed},
  year         = {2019},
}

@Article{Manzari2023,
  author      = {Manzari, Omid Nejati and Ahmadabadi, Hamid and Kashiani, Hossein and Shokouhi, Shahriar B. and Ayatollahi, Ahmad},
  date        = {2023-02-19},
  title       = {{MedViT}: A Robust Vision Transformer for Generalized Medical Image Classification},
  doi         = {10.48550/ARXIV.2302.09462},
  eprint      = {2302.09462},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Manzari2023 - MedViT_ a Robust Vision Transformer for Generalized Medical Image Classification.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Luo2021,
  author      = {Luo, Shengjie and Li, Shanda and Cai, Tianle and He, Di and Peng, Dinglan and Zheng, Shuxin and Ke, Guolin and Wang, Liwei and Liu, Tie-Yan},
  title       = {Stable, Fast and Accurate: Kernelized Attention with Relative Positional Encoding},
  booktitle   = {Advances in Neural Information Processing Systems},
  editor      = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan},
  date        = {2021-06-23},
  year        = {2021},
  eprint      = {2106.12566},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  doi         = {10.48550/arxiv.2106.12566},
  url         = {https://openreview.net/forum?id=X7XNPor93uG},
  file        = {:Luo2021 - Stable, Fast and Accurate_ Kernelized Attention with Relative Positional Encoding.pdf:PDF},
}

@Article{Agrawal2023,
  author      = {Agrawal, Siddharth},
  date        = {2023-02-02},
  title       = {Scaling Up Computer Vision Neural Networks Using Fast {Fourier} Transform},
  doi         = {10.48550/ARXIV.2302.12185},
  eprint      = {2302.12185},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Agrawal2023 - Scaling up Computer Vision Neural Networks Using Fast Fourier Transform.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Chen2021,
  author         = {Chen, Beidi and Dao, Tri and Winsor, Eric and Song, Zhao and Rudra, Atri and Ré, Christopher},
  title          = {Scatterbrain: Unifying Sparse and Low-rank Attention Approximation},
  booktitle      = {Advances in Neural Information Processing Systems},
  editor         = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan},
  date           = {2021-10-28},
  year           = {2021},
  eprint         = {2110.15343},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  doi            = {10.48550/arxiv.2110.15343},
  url            = {https://openreview.net/forum?id=SehIKudiIo1},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  file           = {:Chen2021 - Scatterbrain_ Unifying Sparse and Low Rank Attention Approximation.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Desislavov2023,
  author       = {Radosvet Desislavov and Fernando Mart{\'{\i}}nez-Plumed and Jos{\'{e}} Hern{\'{a}}ndez-Orallo},
  date         = {2023-02},
  journaltitle = {Sustainable Computing: Informatics and Systems},
  title        = {Trends in {AI} inference energy consumption: Beyond the performance-vs-parameter laws of deep learning},
  doi          = {10.1016/j.suscom.2023.100857},
  pages        = {100857},
  file         = {:Desislavov2023 - Trends in AI Inference Energy Consumption_ beyond the Performance Vs Parameter Laws of Deep Learning.pdf:PDF},
  publisher    = {Elsevier {BV}},
  year         = {2023},
}

@InProceedings{Xiong2021,
  author         = {Xiong, Yunyang and Zeng, Zhanpeng and Chakraborty, Rudrasis and Tan, Mingxing and Fung, Glenn and Li, Yin and Singh, Vikas},
  booktitle      = {Proceedings of the AAAI Conference on Artificial Intelligence},
  title          = {{Nyströmformer}: A {Nyström}-Based Algorithm for Approximating Self-Attention},
  doi            = {10.48550/arxiv.2102.03902},
  eprint         = {2102.03902},
  eprintclass    = {cs.CL},
  eprinttype     = {arXiv},
  file           = {:Xiong2021 - Nyströmformer_ a Nyström Based Algorithm for Approximating Self Attention.pdf:PDF},
  keywords       = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

% arXiv-only record (publisher arXiv, no journal), so it is typed Misc like the other arXiv preprints in this file.
@Misc{Koh2023,
  author      = {Koh, Jing Yu and Salakhutdinov, Ruslan and Fried, Daniel},
  date        = {2023-01-31},
  title       = {Grounding Language Models to Images for Multimodal Generation},
  doi         = {10.48550/ARXIV.2301.13823},
  eprint      = {2301.13823},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Koh2023 - Grounding Language Models to Images for Multimodal Generation.pdf:PDF},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Zhai2022,
  title       = {Scaling Vision Transformers},
  author      = {Zhai, Xiaohua and Kolesnikov, Alexander and Houlsby, Neil and Beyer, Lucas},
  booktitle   = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages       = {12104--12113},
  year        = {2022},
  doi         = {10.48550/arxiv.2106.04560},
  eprint      = {2106.04560},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  file        = {:Zhai2022 - Scaling Vision Transformers.pdf:PDF},
}

@Article{Yu2022,
  author      = {Yu, Jiahui and Wang, Zirui and Vasudevan, Vijay and Yeung, Legg and Seyedhosseini, Mojtaba and Wu, Yonghui},
  title       = {{CoCa}: Contrastive Captioners are Image-Text Foundation Models},
  doi         = {10.48550/arxiv.2205.01917},
  eprint      = {2205.01917},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  issn        = {2835-8856},
  url         = {https://openreview.net/forum?id=Ee277P3AYC},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Yu2022 - CoCa_ Contrastive Captioners Are Image Text Foundation Models.pdf:PDF},
  journal     = {Transactions on Machine Learning Research},
  year        = {2022},
}

@InProceedings{Brown2020,
  author      = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  booktitle   = {Advances in Neural Information Processing Systems},
  title       = {Language Models are Few-Shot Learners},
  doi         = {10.48550/arxiv.2005.14165},
  editor      = {Larochelle, H. and Ranzato, M. and Hadsell, R. and Balcan, M. F. and Lin, H.},
  eprint      = {2005.14165},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  pages       = {1877--1901},
  publisher   = {Curran Associates, Inc.},
  url         = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
  volume      = {33},
  file        = {:Brown2020 - Language Models Are Few Shot Learners.pdf:PDF},
  year        = {2020},
}

% arXiv-only record (publisher arXiv, no journal), so it is typed Misc like the other arXiv preprints in this file.
@Misc{Takashima2023,
  author         = {Takashima, Sora and Hayamizu, Ryo and Inoue, Nakamasa and Kataoka, Hirokatsu and Yokota, Rio},
  date           = {2023-03-02},
  title          = {Visual Atoms: Pre-training Vision Transformers with Sinusoidal Waves},
  doi            = {10.48550/ARXIV.2303.01112},
  eprint         = {2303.01112},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Takashima2023 - Visual Atoms_ Pre Training Vision Transformers with Sinusoidal Waves.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2023},
}

@InProceedings{Vaswani2021,
  author         = {Vaswani, Ashish and Ramachandran, Prajit and Srinivas, Aravind and Parmar, Niki and Hechtman, Blake and Shlens, Jonathon},
  booktitle      = {2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title          = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones},
  doi            = {10.1109/CVPR46437.2021.01270},
  eprint         = {2103.12731},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  pages          = {12889--12899},
  publisher      = {IEEE Computer Society},
  address        = {Los Alamitos, CA, USA},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Vaswani2021 - Scaling Local Self Attention for Parameter Efficient Visual Backbones.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  month          = jun,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Yang2022a,
  author         = {Yang, Jianwei and Li, Chunyuan and Dai, Xiyang and Yuan, Lu and Gao, Jianfeng},
  booktitle      = {Advances in Neural Information Processing Systems},
  date           = {2022-03-22},
  title          = {Focal Modulation Networks},
  doi            = {10.48550/arxiv.2203.11926},
  editor         = {Oh, Alice H. and Agarwal, Alekh and Belgrave, Danielle and Cho, Kyunghyun},
  eprint         = {2203.11926},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  url            = {https://openreview.net/forum?id=ePhEbo039l},
  file           = {:Yang2022a - Focal Modulation Networks.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@InProceedings{Yin2022,
  author         = {Yin, Hongxu and Vahdat, Arash and Alvarez, Jose M. and Mallya, Arun and Kautz, Jan and Molchanov, Pavlo},
  booktitle      = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title          = {{A-ViT}: Adaptive Tokens for Efficient Vision Transformer},
  pages          = {10809--10818},
  file           = {:Yin2022 - A ViT_ Adaptive Tokens for Efficient Vision Transformer.pdf:PDF},
  month          = jun,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

% Conference paper: the venue previously stored in the journal field is now the booktitle of an InProceedings entry.
@InProceedings{Meng2022,
  author      = {Meng, Lingchen and Li, Hengduo and Chen, Bor-Chun and Lan, Shiyi and Wu, Zuxuan and Jiang, Yu-Gang and Lim, Ser-Nam},
  booktitle   = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title       = {{AdaViT}: Adaptive Vision Transformers for Efficient Image Recognition},
  doi         = {10.48550/arxiv.2111.15668},
  eprint      = {2111.15668},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  pages       = {12299--12308},
  file        = {:Meng2022 - AdaViT_ Adaptive Vision Transformers for Efficient Image Recognition.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  year        = {2022},
}

% NOTE(review): doi/eprint reference the arXiv preprint; publisher names the proceedings publisher (was "arXiv").
@InProceedings{Tay2020,
  author         = {Tay, Yi and Bahri, Dara and Yang, Liu and Metzler, Donald and Juan, Da-Cheng},
  booktitle      = {International Conference on Machine Learning},
  title          = {Sparse Sinkhorn Attention},
  doi            = {10.48550/ARXIV.2002.11296},
  eprint         = {2002.11296},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  pages          = {9438--9447},
  publisher      = {PMLR},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Tay2020 - Sparse Sinkhorn Attention.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2020},
}

% NOTE(review): date and copyright come from the arXiv record; doi, journal, issn and volume describe the journal version.
@Article{Mehrani2023,
  author         = {Mehrani, Paria and Tsotsos, John K.},
  date           = {2023-03-02},
  title          = {Self-attention in vision transformers performs perceptual grouping, not attention},
  doi            = {10.3389/fcomp.2023.1178450},
  eprint         = {2303.01542},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  issn           = {2624-9898},
  volume         = {5},
  copyright      = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file           = {:Mehrani2023 - Self Attention in Vision Transformers Performs Perceptual Grouping, Not Attention.pdf:PDF},
  journal        = {Frontiers in Computer Science},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {Frontiers Media SA},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

% arXiv-only record (publisher arXiv, no journal), so it is typed Misc like the other arXiv preprints in this file.
@Misc{Gupta2023,
  author      = {Gupta, Animesh and Hasan, Irtiza and Prasad, Dilip K. and Gupta, Deepak K.},
  date        = {2023-03-03},
  title       = {Data-Efficient Training of {CNNs} and Transformers with Coresets: A Stability Perspective},
  doi         = {10.48550/ARXIV.2303.02095},
  eprint      = {2303.02095},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Gupta2023 - Data Efficient Training of CNNs and Transformers with Coresets_ a Stability Perspective.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

% arXiv-only record (publisher arXiv, no journal), so it is typed Misc like the other arXiv preprints in this file.
@Misc{Yu2023,
  author      = {Yu, Lu and Xiang, Wei},
  date        = {2023-03-08},
  title       = {{X-Pruner}: {eXplainable} Pruning for Vision Transformers},
  doi         = {10.48550/ARXIV.2303.04935},
  eprint      = {2303.04935},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Yu2023 - X Pruner_ EXplainable Pruning for Vision Transformers.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Chang2023,
  author        = {Chang, Shuning and Wang, Pichao and Lin, Ming and Wang, Fan and Zhang, David Junhao and Jin, Rong and Shou, Mike Zheng},
  booktitle     = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title         = {Making Vision Transformers Efficient From a Token Sparsification View},
  eprint        = {2303.08685},
  eprintclass   = {cs.CV},
  eprinttype    = {arXiv},
  pages         = {6195--6205},
  archiveprefix = {arXiv},
  file          = {:Chang2023 - Making Vision Transformers Efficient from a Token Sparsification View.pdf:PDF},
  month         = jun,
  year          = {2023},
}

% Conference paper: the venue previously stored in the journal field is now the booktitle; PMLR moved from organization to publisher.
@InProceedings{Katharopoulos2020,
  author         = {Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, François},
  booktitle      = {International Conference on Machine Learning},
  title          = {Transformers are {RNNs}: Fast Autoregressive Transformers with Linear Attention},
  doi            = {10.48550/arxiv.2006.16236},
  eprint         = {2006.16236},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  pages          = {5156--5165},
  publisher      = {PMLR},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Katharopoulos2020 - Transformers Are RNNs_ Fast Autoregressive Transformers with Linear Attention.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2020},
}

@InProceedings{Wu2021,
  author         = {Wu, Haiping and Xiao, Bin and Codella, Noel and Liu, Mengchen and Dai, Xiyang and Yuan, Lu and Zhang, Lei},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  date           = {2021-03-29},
  title          = {{CvT}: Introducing Convolutions to Vision Transformers},
  doi            = {10.48550/arxiv.2103.15808},
  eprint         = {2103.15808},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  pages          = {22--31},
  file           = {:Wu2021 - CvT_ Introducing Convolutions to Vision Transformers.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

% arXiv-only record (publisher arXiv, no journal), so it is typed Misc like the other arXiv preprints in this file.
@Misc{Nekoozadeh2023,
  author      = {Nekoozadeh, Anahita and Ahmadzadeh, Mohammad Reza and Mardani, Zahra and Mardani, Morteza},
  date        = {2023-03-22},
  title       = {Multiscale Attention via Wavelet Neural Operators for Vision Transformers},
  doi         = {10.48550/ARXIV.2303.12398},
  eprint      = {2303.12398},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Nekoozadeh2023 - Multiscale Attention Via Wavelet Neural Operators for Vision Transformers.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Yao2022,
  author         = {Yao, Ting and Pan, Yingwei and Li, Yehao and Ngo, Chong-Wah and Mei, Tao},
  booktitle      = {Computer Vision -- ECCV 2022},
  title          = {{Wave-ViT}: Unifying Wavelet and Transformers for Visual Representation Learning},
  doi            = {10.48550/arxiv.2207.04978},
  editor         = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
  eprint         = {2207.04978},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  pages          = {328--345},
  publisher      = {Springer Nature Switzerland},
  address        = {Cham},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Yao2022 - Wave ViT_ Unifying Wavelet and Transformers for Visual Representation Learning.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@InProceedings{Li2023d,
  author         = {Li, Yanyu and Hu, Ju and Wen, Yang and Evangelidis, Georgios and Salahi, Kamyar and Wang, Yanzhi and Tulyakov, Sergey and Ren, Jian},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {Rethinking Vision Transformers for {MobileNet} Size and Speed},
  eprint         = {2212.08059},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  pages          = {16889--16900},
  archiveprefix  = {arXiv},
  comment        = {EfficientFormerV2},
  file           = {:Li2022a - Rethinking Vision Transformers for MobileNet Size and Speed.pdf:PDF},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Roy2021,
  title          = {Efficient Content-Based Sparse Attention with Routing Transformers},
  author         = {Roy, Aurko and Saffar, Mohammad and Vaswani, Ashish and Grangier, David},
  journal        = {Transactions of the Association for Computational Linguistics},
  volume         = {9},
  pages          = {53--68},
  year           = {2021},
  publisher      = {MIT Press},
  address        = {Cambridge, MA},
  doi            = {10.1162/tacl_a_00353},
  url            = {https://aclanthology.org/2021.tacl-1.4},
  eprint         = {2003.05997},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Roy2021 - Efficient Content Based Sparse Attention with Routing Transformers.pdf:PDF},
  readstatus     = {read},
  qualityassured = {qualityAssured},
}

@InProceedings{Tay2021,
  author         = {Tay, Yi and Bahri, Dara and Metzler, Donald and Juan, Da-Cheng and Zhao, Zhe and Zheng, Che},
  booktitle      = {International Conference on Machine Learning},
  title          = {Synthesizer: Rethinking Self-Attention in Transformer Models},
  doi            = {10.48550/arxiv.2005.00743},
  eprint         = {2005.00743},
  eprintclass    = {cs.CL},
  eprinttype     = {arXiv},
  pages          = {10183--10192},
  publisher      = {PMLR},
  file           = {:Tay2021 - Synthesizer_ Rethinking Self Attention in Transformer Models.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Fedus2022,
  author         = {Fedus, William and Zoph, Barret and Shazeer, Noam},
  title          = {Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},
  doi            = {10.48550/ARXIV.2101.03961},
  eprint         = {2101.03961},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  number         = {1},
  pages          = {5232--5270},
  volume         = {23},
  file           = {:Fedus2022 - Switch Transformers_ Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.pdf:PDF},
  journal        = {The Journal of Machine Learning Research},
  publisher      = {JMLR.org},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

% Conference paper: the venue abbreviation "NeurIPS 2021" from the journal field is expanded into the booktitle used by the other NeurIPS entries.
@InProceedings{Ryoo2021,
  author         = {Ryoo, Michael S. and Piergiovanni, AJ and Arnab, Anurag and Dehghani, Mostafa and Angelova, Anelia},
  booktitle      = {Advances in Neural Information Processing Systems},
  date           = {2021-06-21},
  title          = {{TokenLearner}: What Can 8 Learned Tokens Do for Images and Videos?},
  doi            = {10.48550/ARXIV.2106.11297},
  eprint         = {2106.11297},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  file           = {:Ryoo2021 - TokenLearner_ What Can 8 Learned Tokens Do for Images and Videos_.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Choromanski2024,
  author         = {Choromanski, Krzysztof Marcin and Li, Shanda and Likhosherstov, Valerii and Dubey, Kumar Avinava and Luo, Shengjie and He, Di and Yang, Yiming and Sarlos, Tamas and Weingarten, Thomas and Weller, Adrian},
  booktitle      = {International Conference on Artificial Intelligence and Statistics ({AISTATS})},
  title          = {Learning a {Fourier} Transform for Linear Relative Positional Encodings in Transformers},
  eprint         = {2302.01925},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  archiveprefix  = {arXiv},
  file           = {:Choromanski2023 - Learning a Fourier Transform for Linear Relative Positional Encodings in Transformers.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@InProceedings{Bolya2023,
  author         = {Bolya, Daniel and Fu, Cheng-Yang and Dai, Xiaoliang and Zhang, Peizhao and Feichtenhofer, Christoph and Hoffman, Judy},
  booktitle      = {International Conference on Learning Representations},
  title          = {Token Merging: Your {ViT} But Faster},
  doi            = {10.48550/arxiv.2210.09461},
  eprint         = {2210.09461},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  file           = {:Bolya2023 - Token Merging_ Your ViT but Faster.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@InProceedings{Xu2021,
  title          = {Co-Scale Conv-Attentional Image Transformers},
  author         = {Xu, Weijian and Xu, Yifan and Chang, Tyler and Tu, Zhuowen},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages          = {9981--9990},
  year           = {2021},
  doi            = {10.48550/arxiv.2104.06399},
  eprint         = {2104.06399},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  file           = {:Xu2021 - Co Scale Conv Attentional Image Transformers.pdf:PDF},
  readstatus     = {read},
  qualityassured = {qualityAssured},
}

@Article{Ebert2023,
  author    = {Ebert, Nikolas and Stricker, Didier and Wasenm{\"u}ller, Oliver},
  title     = {{PLG-ViT}: Vision Transformer with Parallel Local and Global Self-Attention},
  number    = {7},
  pages     = {3447},
  volume    = {23},
  file      = {:Ebert2023 - PLG ViT_ Vision Transformer with Parallel Local and Global Self Attention.pdf:PDF},
  journal   = {Sensors},
  publisher = {MDPI},
  year      = {2023},
}

% arXiv-only record (publisher arXiv, no journal), so it is typed Misc like the other arXiv preprints in this file.
@Misc{Zhang2023,
  author      = {Zhang, Qiming and Zhang, Jing and Xu, Yufei and Tao, Dacheng},
  date        = {2023-03-27},
  title       = {Vision Transformer with Quadrangle Attention},
  doi         = {10.48550/ARXIV.2303.15105},
  eprint      = {2303.15105},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Zhang2023 - Vision Transformer with Quadrangle Attention.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

% arXiv-only record (publisher arXiv, no journal), so it is typed Misc like the other arXiv preprints in this file.
@Misc{Ronen2023,
  author      = {Ronen, Tomer and Levy, Omer and Golbert, Avram},
  date        = {2023-04-01},
  title       = {Vision Transformers with Mixed-Resolution Tokenization},
  doi         = {10.48550/ARXIV.2304.00287},
  eprint      = {2304.00287},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Ronen2023 - Vision Transformers with Mixed Resolution Tokenization.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Dai2019,
  author      = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
  booktitle   = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  title       = {{Transformer-XL}: Attentive Language Models Beyond a Fixed-Length Context},
  doi         = {10.18653/v1/P19-1285},
  eprint      = {1901.02860},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  pages       = {2978--2988},
  publisher   = {Association for Computational Linguistics},
  address     = {Florence, Italy},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file        = {:Dai2019 - Transformer XL_ Attentive Language Models beyond a Fixed Length Context.pdf:PDF},
  month       = jul,
  year        = {2019},
}

% arXiv-only record (publisher arXiv, no journal), so it is typed Misc like the other arXiv preprints in this file.
@Misc{Togelius2023,
  author      = {Togelius, Julian and Yannakakis, Georgios N.},
  date        = {2023-03-31},
  title       = {Choose Your Weapon: Survival Strategies for Depressed {AI} Academics},
  doi         = {10.48550/ARXIV.2304.06035},
  eprint      = {2304.06035},
  eprintclass = {cs.OH},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Togelius2023 - Choose Your Weapon_ Survival Strategies for Depressed AI Academics.pdf:PDF},
  keywords    = {Other Computer Science (cs.OH), Computers and Society (cs.CY), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
  publisher   = {arXiv},
  readstatus  = {skimmed},
  year        = {2023},
}

@InProceedings{Kim2021,
  author    = {Kim, Kyungmin and Wu, Bichen and Dai, Xiaoliang and Zhang, Peizhao and Yan, Zhicheng and Vajda, Peter and Kim, Seon},
  booktitle = {2021 {IEEE}/{CVF} Conference on Computer Vision and Pattern Recognition Workshops ({CVPRW})},
  date      = {2021-06},
  title     = {Rethinking the Self-Attention in Vision Transformers},
  doi       = {10.1109/cvprw53098.2021.00342},
  publisher = {IEEE},
  file      = {:Kim2021 - Rethinking the Self Attention in Vision Transformers.pdf:PDF},
  year      = {2021},
}

@Misc{Islam2022,
  title         = {Recent Advances in Vision Transformer: A Survey and Outlook of Recent Work},
  author        = {Islam, Khawar},
  year          = {2022},
  date          = {2022-03-03},
  publisher     = {arXiv},
  doi           = {10.48550/ARXIV.2203.01536},
  eprint        = {2203.01536},
  eprinttype    = {arXiv},
  archiveprefix = {arXiv},
  eprintclass   = {cs.CV},
  copyright     = {Creative Commons Attribution 4.0 International},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file          = {:Islam2022 - Recent Advances in Vision Transformer_ a Survey and Outlook of Recent Work.pdf:PDF},
}

% Journal name spelled with "and" to match the other TPAMI entry in this file (Selva2023).
@Article{Han2023,
  author    = {Han, K. and Wang, Y. and Chen, H. and Chen, X. and Guo, J. and Liu, Z. and Tang, Y. and Xiao, A. and Xu, C. and Xu, Y. and Yang, Z. and Zhang, Y. and Tao, D.},
  title     = {A Survey on Vision Transformer},
  doi       = {10.1109/TPAMI.2022.3152247},
  issn      = {1939-3539},
  number    = {1},
  pages     = {87--110},
  volume    = {45},
  address   = {Los Alamitos, CA, USA},
  file      = {:Han2023 - A Survey on Vision Transformer.pdf:PDF},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  keywords  = {transformers;task analysis;encoding;computer vision;computational modeling;visualization;object detection},
  month     = jan,
  publisher = {IEEE Computer Society},
  year      = {2023},
}

@InProceedings{Tay2021a,
  title          = {Long Range Arena: A Benchmark for Efficient Transformers},
  author         = {Tay, Yi and Dehghani, Mostafa and Abnar, Samira and Shen, Yikang and Bahri, Dara and Pham, Philip and Rao, Jinfeng and Yang, Liu and Ruder, Sebastian and Metzler, Donald},
  booktitle      = {International Conference on Learning Representations},
  year           = {2021},
  doi            = {10.48550/arxiv.2011.04006},
  eprint         = {2011.04006},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Tay2021a - Long Range Arena_ a Benchmark for Efficient Transformers.pdf:PDF},
  readstatus     = {read},
  qualityassured = {qualityAssured},
}

@InProceedings{Ranftl2021,
  author         = {Ranftl, Ren{\'e} and Bochkovskiy, Alexey and Koltun, Vladlen},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {Vision Transformers for Dense Prediction},
  pages          = {12179--12188},
  url            = {https://openaccess.thecvf.com/content/ICCV2021/html/Ranftl_Vision_Transformers_for_Dense_Prediction_ICCV_2021_paper.html},
  file           = {:Ranftl2021 - Vision Transformers for Dense Prediction.pdf:PDF},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Park2022,
  title          = {How Do Vision Transformers Work?},
  author         = {Park, Namuk and Kim, Songkuk},
  booktitle      = {International Conference on Learning Representations},
  year           = {2022},
  url            = {https://openreview.net/forum?id=D78Go4hVcxO},
  doi            = {10.48550/arxiv.2202.06709},
  eprint         = {2202.06709},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  file           = {:Park2022 - How Do Vision Transformers Work_.pdf:PDF},
  readstatus     = {read},
  qualityassured = {qualityAssured},
}

% The journal name is stored once, in the journal field, with the acronym brace-protected.
@Article{Lin2022,
  author    = {Lin, Tianyang and Wang, Yuxin and Liu, Xiangyang and Qiu, Xipeng},
  date      = {2022},
  title     = {A survey of transformers},
  doi       = {10.1016/j.aiopen.2022.10.001},
  issn      = {2666-6510},
  pages     = {111--132},
  volume    = {3},
  file      = {:Lin2022 - A Survey of Transformers.pdf:PDF},
  journal   = {{AI} Open},
  publisher = {Elsevier {BV}},
  year      = {2022},
}

@Article{Liu2023b,
  author      = {Liu, Yang and Zhang, Yao and Wang, Yixin and Hou, Feng and Yuan, Jin and Tian, Jiang and Zhang, Yang and Shi, Zhongchao and Fan, Jianping and He, Zhiqiang},
  title       = {A Survey of Visual Transformers},
  doi         = {10.1109/TNNLS.2022.3227717},
  eprint      = {2111.06091},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  pages       = {1--21},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Liu2023b - A Survey of Visual Transformers.pdf:PDF},
  journal     = {IEEE Transactions on Neural Networks and Learning Systems},
  publisher   = {IEEE},
  year        = {2023},
}

@Article{Xu2021a,
  author    = {Xu, Yifan and Wei, Huapeng and Lin, Minxuan and Deng, Yingying and Sheng, Kekai and Zhang, Mengdan and Tang, Fan and Dong, Weiming and Huang, Feiyue and Xu, Changsheng},
  date      = {2021-10},
  title     = {Transformers in computational visual media: A survey},
  doi       = {10.1007/s41095-021-0247-3},
  number    = {1},
  pages     = {33--62},
  volume    = {8},
  file      = {:Xu2021a - Transformers in Computational Visual Media_ a Survey.pdf:PDF},
  journal   = {Computational Visual Media},
  publisher = {Springer Science and Business Media {LLC}},
  year      = {2021},
}

@Article{Shamshad2023,
  author    = {Shamshad, Fahad and Khan, Salman and Zamir, Syed Waqas and Khan, Muhammad Haris and Hayat, Munawar and Khan, Fahad Shahbaz and Fu, Huazhu},
  date      = {2023-04},
  title     = {Transformers in medical imaging: A survey},
  doi       = {10.1016/j.media.2023.102802},
  pages     = {102802},
  file      = {:Shamshad2023 - Transformers in Medical Imaging_ a Survey.pdf:PDF},
  journal   = {Medical Image Analysis},
  publisher = {Elsevier {BV}},
  year      = {2023},
}

@Article{Aleissaee2023,
  author    = {Aleissaee, Abdulaziz Amer and Kumar, Amandeep and Anwer, Rao Muhammad and Khan, Salman and Cholakkal, Hisham and Xia, Gui-Song and Khan, Fahad Shahbaz},
  date      = {2023-03},
  title     = {Transformers in Remote Sensing: A Survey},
  doi       = {10.3390/rs15071860},
  number    = {7},
  pages     = {1860},
  volume    = {15},
  file      = {:Aleissaee2023 - Transformers in Remote Sensing_ a Survey.pdf:PDF},
  journal   = {Remote Sensing},
  publisher = {{MDPI} {AG}},
  year      = {2023},
}

@Article{Selva2023,
  author  = {Selva, Javier and Johansen, Anders S. and Escalera, Sergio and Nasrollahi, Kamal and Moeslund, Thomas B. and Clapés, Albert},
  title   = {Video Transformers: A Survey},
  doi     = {10.1109/TPAMI.2023.3243465},
  pages   = {1--20},
  file    = {:Selva2023 - Video Transformers_ a Survey.pdf:PDF},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2023},
}

@Article{Li2023b,
  author    = {Li, Jun and Chen, Junyu and Tang, Yucheng and Wang, Ce and Landman, Bennett A. and Zhou, S. Kevin},
  date      = {2023-04},
  title     = {Transforming medical imaging with Transformers? A comparative review of key properties, current progresses, and future perspectives},
  doi       = {10.1016/j.media.2023.102762},
  pages     = {102762},
  volume    = {85},
  file      = {:Li2023b - Transforming Medical Imaging with Transformers_ a Comparative Review of Key Properties, Current Progresses, and Future Perspectives.pdf:PDF},
  journal   = {Medical Image Analysis},
  publisher = {Elsevier {BV}},
  year      = {2023},
}

@Article{He2023,
  author    = {He, Kelei and Gan, Chen and Li, Zhuoyuan and Rekik, Islem and Yin, Zihao and Ji, Wen and Gao, Yang and Wang, Qian and Zhang, Junfeng and Shen, Dinggang},
  title     = {Transformers in medical image analysis},
  journal   = {Intelligent Medicine},
  volume    = {3},
  number    = {1},
  pages     = {59--78},
  year      = {2023},
  date      = {2023-02},
  publisher = {Elsevier {BV}},
  doi       = {10.1016/j.imed.2022.07.002},
  file      = {:He2023 - Transformers in Medical Image Analysis.pdf:PDF},
}

@Misc{Yang2022b,
  author        = {Yang, Yuting and Jiao, Licheng and Liu, Xu and Liu, Fang and Yang, Shuyuan and Feng, Zhixi and Tang, Xu},
  title         = {Transformers Meet Visual Learning Understanding: A Comprehensive Review},
  year          = {2022},
  date          = {2022-03-24},
  publisher     = {arXiv},
  doi           = {10.48550/ARXIV.2203.12944},
  eprint        = {2203.12944},
  eprinttype    = {arXiv},
  eprintclass   = {cs.CV},
  archiveprefix = {arXiv},
  copyright     = {Creative Commons Attribution 4.0 International},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file          = {:Yang2022b - Transformers Meet Visual Learning Understanding_ a Comprehensive Review.pdf:PDF},
}

@Misc{Latif2023,
  author        = {Latif, Siddique and Zaidi, Aun and Cuayahuitl, Heriberto and Shamshad, Fahad and Shoukat, Moazzam and Qadir, Junaid},
  title         = {Transformers in Speech Processing: A Survey},
  year          = {2023},
  date          = {2023-03-21},
  publisher     = {arXiv},
  doi           = {10.48550/ARXIV.2303.11607},
  eprint        = {2303.11607},
  eprinttype    = {arXiv},
  eprintclass   = {cs.CL},
  archiveprefix = {arXiv},
  copyright     = {arXiv.org perpetual, non-exclusive license},
  keywords      = {Computation and Language (cs.CL), Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  file          = {:Latif2023 - Transformers in Speech Processing_ a Survey.pdf:PDF},
}

@Article{Casola2022,
  author    = {Casola, Silvia and Lauriola, Ivano and Lavelli, Alberto},
  title     = {Pre-trained transformers: an empirical comparison},
  journal   = {Machine Learning with Applications},
  volume    = {9},
  pages     = {100334},
  year      = {2022},
  date      = {2022-09},
  publisher = {Elsevier {BV}},
  doi       = {10.1016/j.mlwa.2022.100334},
  file      = {:Casola2020 - Pre Trained Transformers_ an Empirical Comparison.pdf:PDF},
}

@Misc{Ulhaq2022,
  author        = {Ulhaq, Anwaar and Akhtar, Naveed and Pogrebna, Ganna and Mian, Ajmal},
  title         = {Vision Transformers for Action Recognition: A Survey},
  year          = {2022},
  date          = {2022-09-13},
  publisher     = {arXiv},
  doi           = {10.48550/ARXIV.2209.05700},
  eprint        = {2209.05700},
  eprinttype    = {arXiv},
  eprintclass   = {cs.CV},
  archiveprefix = {arXiv},
  copyright     = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  file          = {:Ulhaq2022 - Vision Transformers for Action Recognition_ a Survey.pdf:PDF},
}

@Article{Ali2023,
  author         = {Ali, Anas M. and Benjdira, Bilel and Koubaa, Anis and El-Shafai, Walid and Khan, Zahid and Boulila, Wadii},
  title          = {Vision Transformers in Image Restoration: A Survey},
  doi            = {10.3390/s23052385},
  issn           = {1424-8220},
  number         = {5},
  pages          = {2385},
  url            = {https://www.mdpi.com/1424-8220/23/5/2385},
  volume         = {23},
  article-number = {2385},
  file           = {:Ali2023 - Vision Transformers in Image Restoration_ a Survey.pdf:PDF},
  journal        = {Sensors},
  pubmedid       = {36904589},
  year           = {2023},
}

@Article{Parvaiz2023,
  author    = {Parvaiz, Arshi and Khalid, Muhammad Anwaar and Zafar, Rukhsana and Ameer, Huma and Ali, Muhammad and Fraz, Muhammad Moazam},
  title     = {Vision Transformers in medical computer vision{\textemdash}A contemplative retrospection},
  journal   = {Engineering Applications of Artificial Intelligence},
  volume    = {122},
  pages     = {106126},
  year      = {2023},
  date      = {2023-06},
  publisher = {Elsevier {BV}},
  doi       = {10.1016/j.engappai.2023.106126},
  file      = {:Parvaiz2023 - Vision Transformers in Medical Computer Vision_A Contemplative Retrospection.pdf:PDF},
}

@InProceedings{Wen2022,
  author      = {Wen, Qingsong and Zhou, Tian and Zhang, Chaoli and Chen, Weiqi and Ma, Ziqing and Yan, Junchi and Sun, Liang},
  booktitle   = {32nd International Joint Conference on Artificial Intelligence ({IJCAI} 2023)},
  date        = {2022-02-15},
  title       = {Transformers in Time Series: A Survey},
  doi         = {10.48550/ARXIV.2202.07125},
  eprint      = {2202.07125},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Wen2022 - Transformers in Time Series_ a Survey.pdf:PDF},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Signal Processing (eess.SP), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  year        = {2022},
}

@InProceedings{Li2023a,
  author         = {Li, Wei and Xie, Jiahao and Loy, Chen Change},
  title          = {Correlational Image Modeling for Self-Supervised Visual Pre-Training},
  booktitle      = {The Eleventh International Conference on Learning Representations},
  year           = {2023},
  doi            = {10.48550/arxiv.2303.12670},
  url            = {https://openreview.net/forum?id=09hVcSDkea},
  eprint         = {2303.12670},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  file           = {:Li2023a - Correlational Image Modeling for Self Supervised Visual Pre Training.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Yeh2023,
  author         = {Yeh, Catherine and Chen, Yida and Wu, Aoyu and Chen, Cynthia and Viégas, Fernanda and Wattenberg, Martin},
  date           = {2023-05-04},
  title          = {{AttentionViz}: A Global View of Transformer Attention},
  doi            = {10.48550/ARXIV.2305.03210},
  eprint         = {2305.03210},
  eprintclass    = {cs.HC},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Yeh2023 - AttentionViz_ a Global View of Transformer Attention.pdf:PDF},
  keywords       = {Human-Computer Interaction (cs.HC), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2023},
}

@InProceedings{Keles2023,
  author         = {Keles, Feyza Duman and Wijewardena, Pruthuvi Mahesakya and Hegde, Chinmay},
  booktitle      = {International Conference on Algorithmic Learning Theory},
  title          = {On The Computational Complexity of Self-Attention},
  doi            = {10.48550/arxiv.2209.04881},
  eprint         = {2209.04881},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  organization   = {PMLR},
  pages          = {597--619},
  file           = {:Keles2023 - On the Computational Complexity of Self Attention.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Alabdulmohsin2023,
  author         = {Alabdulmohsin, Ibrahim and Zhai, Xiaohua and Kolesnikov, Alexander and Beyer, Lucas},
  date           = {2023-05-22},
  title          = {Getting {ViT} in Shape: Scaling Laws for Compute-Optimal Model Design},
  doi            = {10.48550/ARXIV.2305.13035},
  eprint         = {2305.13035},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Alabdulmohsin2023 - Getting ViT in Shape_ Scaling Laws for Compute Optimal Model Design.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences, I.2.10; I.2.6},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Abbas2023,
  author      = {Abbas, Amro and Tirumala, Kushal and Simig, Dániel and Ganguli, Surya and Morcos, Ari S.},
  date        = {2023-03-16},
  title       = {{SemDeDup}: Data-efficient learning at web-scale through semantic deduplication},
  doi         = {10.48550/ARXIV.2303.09540},
  eprint      = {2303.09540},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Abbas2023 - SemDeDup_ Data Efficient Learning at Web Scale through Semantic Deduplication.pdf:PDF},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Brock2021,
  author         = {Brock, Andrew and De, Soham and Smith, Samuel L. and Simonyan, Karen},
  title          = {High-Performance Large-Scale Image Recognition Without Normalization},
  booktitle      = {International Conference on Machine Learning},
  pages          = {1059--1071},
  organization   = {PMLR},
  year           = {2021},
  doi            = {10.48550/arxiv.2102.06171},
  eprint         = {2102.06171},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  file           = {:Brock2021 - High Performance Large Scale Image Recognition without Normalization.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Scribano2023,
  author         = {Scribano, Carmelo and Franchini, Giorgia and Prato, Marco and Bertogna, Marko},
  title          = {{DCT}-Former: Efficient Self-Attention with Discrete Cosine Transform},
  journaltitle   = {Journal of Scientific Computing},
  volume         = {94},
  number         = {3},
  date           = {2023-02},
  publisher      = {Springer Science and Business Media {LLC}},
  doi            = {10.1007/s10915-023-02125-5},
  file           = {:Scribano2023 - DCT Former_ Efficient Self Attention with Discrete Cosine Transform.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Article{Zhang2022a,
  author      = {Zhang, Lei and Zhang, Jie and Lei, Bowen and Mukherjee, Subhabrata and Pan, Xiang and Zhao, Bo and Ding, Caiwen and Li, Yao and Xu, Dongkuan},
  title       = {Accelerating Dataset Distillation via Model Augmentation},
  year        = {2022},
  date        = {2022-12-12},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2212.06152},
  eprint      = {2212.06152},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file        = {:Zhang2022a - Accelerating Dataset Distillation Via Model Augmentation.pdf:PDF},
}

@InProceedings{Davari2022,
  author    = {Davari, MohammadReza and Horoi, Stefan and Natik, Amine and Lajoie, Guillaume and Wolf, Guy and Belilovsky, Eugene},
  title     = {Deceiving the {CKA} Similarity Measure in Deep Learning},
  booktitle = {NeurIPS ML Safety Workshop},
  year      = {2022},
  url       = {https://openreview.net/forum?id=hITONWhDIIJ},
  file      = {:Davari2022 - Deceiving the CKA Similarity Measure in Deep Learning.pdf:PDF},
}

@InProceedings{Kolesnikov2020,
  author         = {Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Puigcerver, Joan and Yung, Jessica and Gelly, Sylvain and Houlsby, Neil},
  booktitle      = {Computer Vision -- ECCV 2020},
  title          = {Big Transfer ({BiT}): General Visual Representation Learning},
  doi            = {10.48550/arxiv.1912.11370},
  eprint         = {1912.11370},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  isbn           = {978-3-030-58558-7},
  pages          = {491--507},
  publisher      = {Springer International Publishing},
  address        = {Cham},
  file           = {:Kolesnikov2020 - Big Transfer (BiT)_ General Visual Representation Learning.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2020},
}

@InProceedings{He2016,
  author      = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle   = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  title       = {Deep Residual Learning for Image Recognition},
  doi         = {10.48550/arxiv.1512.03385},
  eprint      = {1512.03385},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  pages       = {770--778},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:He2016 - Deep Residual Learning for Image Recognition.pdf:PDF},
  year        = {2016},
}

@InProceedings{Martins2022,
  author         = {Martins, Pedro Henrique and Marinho, Zita and Martins, André F. T.},
  booktitle      = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  title          = {{$\infty$}-former: Infinite Memory Transformer},
  doi            = {10.18653/v1/2022.acl-long.375},
  eprint         = {2109.00301},
  eprintclass    = {cs.CL},
  eprinttype     = {arXiv},
  pages          = {5468--5485},
  publisher      = {Association for Computational Linguistics},
  url            = {https://aclanthology.org/2022.acl-long.375},
  address        = {Dublin, Ireland},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Martins2022 - $$ Former_ Infinite Memory Transformer.pdf:PDF},
  month          = may,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Misc{Mukherjee2023,
  author        = {Subhabrata Mukherjee and Arindam Mitra and Ganesh Jawahar and Sahaj Agarwal and Hamid Palangi and Ahmed Awadallah},
  title         = {Orca: Progressive Learning from Complex Explanation Traces of {GPT-4}},
  doi           = {10.48550/arxiv.2306.02707},
  eprint        = {2306.02707},
  archiveprefix = {arXiv},
  file          = {:Mukherjee2023 - Orca_ Progressive Learning from Complex Explanation Traces of GPT 4.pdf:PDF},
  primaryclass  = {cs.CL},
  year          = {2023},
}

@InProceedings{You2020,
  author      = {You, Yang and Li, Jing and Reddi, Sashank and Hseu, Jonathan and Kumar, Sanjiv and Bhojanapalli, Srinadh and Song, Xiaodan and Demmel, James and Keutzer, Kurt and Hsieh, Cho-Jui},
  booktitle   = {International Conference on Learning Representations},
  title       = {Large Batch Optimization for Deep Learning: Training {BERT} in 76 minutes},
  doi         = {10.48550/arxiv.1904.00962},
  eprint      = {1904.00962},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  url         = {https://openreview.net/forum?id=Syx4wnEtvH},
  file        = {:You2020 - Large Batch Optimization for Deep Learning_ Training BERT in 76 Minutes.pdf:PDF},
  year        = {2020},
}

@Article{Diwan2023,
  author         = {Diwan, Anuj and Choi, Eunsol and Harwath, David},
  title          = {When to Use Efficient Self Attention? Profiling Text, Speech and Image Transformer Variants},
  year           = {2023},
  date           = {2023-06-14},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2306.08667},
  eprint         = {2306.08667},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CL},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computation and Language (cs.CL), Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  file           = {:Diwan2023 - When to Use Efficient Self Attention_ Profiling Text, Speech and Image Transformer Variants.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Kurtz2023,
  author      = {Kurtz, Yoav and Bar, Noga and Giryes, Raja},
  title       = {Group Orthogonalization Regularization For Vision Models Adaptation and Robustness},
  year        = {2023},
  date        = {2023-06-16},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2306.10001},
  eprint      = {2306.10001},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file        = {:Kurtz2023 - Group Orthogonalization Regularization for Vision Models Adaptation and Robustness.pdf:PDF},
}

@Article{Zhang2023a,
  author      = {Zhang, Jifan and Chen, Yifang and Canal, Gregory and Mussmann, Stephen and Zhu, Yinglun and Du, Simon Shaolei and Jamieson, Kevin and Nowak, Robert D.},
  date        = {2023-06-16},
  title       = {{LabelBench}: A Comprehensive Framework for Benchmarking Label-Efficient Learning},
  doi         = {10.48550/ARXIV.2306.09910},
  eprint      = {2306.09910},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Zhang2023a - LabelBench_ a Comprehensive Framework for Benchmarking Label Efficient Learning.pdf:PDF},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2023},
}

@Article{Ding2023,
  author         = {Ding, Jiayu and Ma, Shuming and Dong, Li and Zhang, Xingxing and Huang, Shaohan and Wang, Wenhui and Wei, Furu},
  date           = {2023-07-05},
  title          = {{LongNet}: Scaling Transformers to 1,000,000,000 Tokens},
  doi            = {10.48550/ARXIV.2307.02486},
  eprint         = {2307.02486},
  eprintclass    = {cs.CL},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Ding2023 - LongNet_ Scaling Transformers to 1,000,000,000 Tokens.pdf:PDF},
  keywords       = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2023},
}

@InProceedings{Tworkowski2023,
  author         = {Tworkowski, Szymon and Staniszewski, Konrad and Pacek, Mikołaj and Wu, Yuhuai and Michalewski, Henryk and Miłoś, Piotr},
  booktitle      = {Thirty-seventh Conference on Neural Information Processing Systems},
  title          = {Focused Transformer: Contrastive Training for Context Scaling},
  doi            = {10.48550/arxiv.2307.03170},
  eprint         = {2307.03170},
  eprintclass    = {cs.CL},
  eprinttype     = {arXiv},
  file           = {:Tworkowski2023 - Focused Transformer_ Contrastive Training for Context Scaling.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Vincent2015,
  author      = {Vincent, Pascal and de Brébisson, Alexandre and Bouthillier, Xavier},
  title       = {Efficient Exact Gradient Update for training Deep Networks with Very Large Sparse Targets},
  journal     = {Advances in Neural Information Processing Systems},
  volume      = {28},
  year        = {2015},
  doi         = {10.48550/arxiv.1412.7091},
  eprint      = {1412.7091},
  eprinttype  = {arXiv},
  eprintclass = {cs.NE},
  keywords    = {Neural and Evolutionary Computing (cs.NE), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Vincent2015 - Efficient Exact Gradient Update for Training Deep Networks with Very Large Sparse Targets.pdf:PDF},
}

@InProceedings{Liang2017,
  author    = {Liang, Xuezhi and Wang, Xiaobo and Lei, Zhen and Liao, Shengcai and Li, Stan Z.},
  title     = {Soft-Margin Softmax for Deep Classification},
  booktitle = {Neural Information Processing},
  editor    = {Liu, Derong and Xie, Shengli and Li, Yuanqing and Zhao, Dongbin and El-Alfy, El-Sayed M.},
  pages     = {413--421},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2017},
  isbn      = {978-3-319-70096-0},
  file      = {:Liang2017 - Soft Margin Softmax for Deep Classification.pdf:PDF},
}

@Article{Banerjee2020,
  author     = {Kunal Banerjee and Vishak Prasad C. and Rishi Raj Gupta and Karthik Vyas and Anushree H. and Biswajit Mishra},
  title      = {Exploring Alternatives to Softmax Function},
  journal    = {CoRR},
  volume     = {abs/2011.11538},
  year       = {2020},
  doi        = {10.48550/arxiv.2011.11538},
  eprint     = {2011.11538},
  eprinttype = {arXiv},
  copyright  = {Creative Commons Attribution Share Alike 4.0 International},
  file       = {:Banerjee2020 - Exploring Alternatives to Softmax Function.pdf:PDF},
}

@InProceedings{Brebisson2016,
  author      = {de Br{\'{e}}bisson, Alexandre and Vincent, Pascal},
  title       = {An Exploration of Softmax Alternatives Belonging to the Spherical Loss Family},
  booktitle   = {4th International Conference on Learning Representations, {ICLR} 2016, San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings},
  editor      = {Bengio, Yoshua and LeCun, Yann},
  year        = {2016},
  doi         = {10.48550/arxiv.1511.05042},
  eprint      = {1511.05042},
  eprinttype  = {arXiv},
  eprintclass = {cs.NE},
  keywords    = {Neural and Evolutionary Computing (cs.NE), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  file        = {:Brebisson2016 - An Exploration of Softmax Alternatives Belonging to the Spherical Loss Family.pdf:PDF},
}

@InProceedings{Solodskikh2023,
  author         = {Solodskikh, Kirill and Kurbanov, Azim and Aydarkhanov, Ruslan and Zhelavskaya, Irina and Parfenov, Yury and Song, Dehua and Lefkimmiatis, Stamatios},
  booktitle      = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title          = {Integral Neural Networks},
  pages          = {16113--16122},
  url            = {https://openaccess.thecvf.com/content/CVPR2023/html/Solodskikh_Integral_Neural_Networks_CVPR_2023_paper.html},
  file           = {:Solodskikh2023 - Integral Neural Networks.pdf:PDF},
  month          = jun,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{He2023a,
  author         = {He, Muyang and Yang, Shuo and Huang, Tiejun and Zhao, Bo},
  title          = {Large-scale Dataset Pruning with Dynamic Uncertainty},
  year           = {2023},
  date           = {2023-06-08},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2306.05175},
  eprint         = {2306.05175},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  copyright      = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  keywords       = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file           = {:He2023a - Large Scale Dataset Pruning with Dynamic Uncertainty.pdf:PDF},
  groups         = {Coreset for FL},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InCollection{Paszke2019,
  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
  booktitle = {Advances in Neural Information Processing Systems 32},
  title     = {{PyTorch}: An Imperative Style, High-Performance Deep Learning Library},
  pages     = {8024--8035},
  publisher = {Curran Associates, Inc.},
  year      = {2019},
}

@Misc{Wightman2019,
  author    = {Ross Wightman},
  title     = {{PyTorch} Image Models},
  doi       = {10.5281/zenodo.4414861},
  url       = {https://github.com/rwightman/pytorch-image-models},
  journal   = {GitHub repository},
  publisher = {GitHub},
  year      = {2019},
}

@Article{Bertsch2023,
  author         = {Bertsch, Amanda and Alon, Uri and Neubig, Graham and Gormley, Matthew R.},
  title          = {Unlimiformer: Long-Range Transformers with Unlimited Length Input},
  year           = {2023},
  date           = {2023-05-02},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2305.01625},
  eprint         = {2305.01625},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CL},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  file           = {:Bertsch2023 - Unlimiformer_ Long Range Transformers with Unlimited Length Input.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Sun2023,
  author         = {Sun, Yutao and Dong, Li and Huang, Shaohan and Ma, Shuming and Xia, Yuqing and Xue, Jilong and Wang, Jianyong and Wei, Furu},
  title          = {Retentive Network: A Successor to Transformer for Large Language Models},
  year           = {2023},
  date           = {2023-07-17},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2307.08621},
  eprint         = {2307.08621},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CL},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file           = {:Sun2023 - Retentive Network_ a Successor to Transformer for Large Language Models.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Chopard2021,
  author    = {Chopard, Daphn{\'{e}} and Treder, Matthias S. and Spasi{\'{c}}, Irena},
  title     = {Learning Data Augmentation Schedules for Natural Language Processing},
  booktitle = {Proceedings of the Second Workshop on Insights from Negative Results in {NLP}},
  date      = {2021},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2021.insights-1.14},
  file      = {:Chopard2021 - Learning Data Augmentation Schedules for Natural Language Processing.pdf:PDF},
}

@InProceedings{Wei2021,
  author      = {Wei, Jason and Huang, Chengyu and Vosoughi, Soroush and Cheng, Yu and Xu, Shiqi},
  booktitle   = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  date        = {2021-03-12},
  title       = {Few-Shot Text Classification with Triplet Networks, Data Augmentation, and Curriculum Learning},
  doi         = {10.18653/v1/2021.naacl-main.434},
  eprint      = {2103.07552},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Wei2021 - Few Shot Text Classification with Triplet Networks, Data Augmentation, and Curriculum Learning.pdf:PDF},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  publisher   = {Association for Computational Linguistics},
  year        = {2021},
}

@InProceedings{Ye2021,
  author    = {Ye, Seonghyeon and Kim, Jiseon and Oh, Alice},
  title     = {Efficient Contrastive Learning via Novel Data Augmentation and Curriculum Learning},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  date      = {2021},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2021.emnlp-main.138},
  file      = {:Ye2021 - Efficient Contrastive Learning Via Novel Data Augmentation and Curriculum Learning.pdf:PDF},
}

@InProceedings{Tan2021,
  author      = {Tan, Mingxing and Le, Quoc V.},
  booktitle   = {International Conference on Machine Learning},
  date        = {2021-04-01},
  title       = {{EfficientNetV2}: Smaller Models and Faster Training},
  doi         = {10.48550/ARXIV.2104.00298},
  eprint      = {2104.00298},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Tan2021 - EfficientNetV2_ Smaller Models and Faster Training.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  year        = {2021},
}

@Article{Ahn2023,
  author      = {Ahn, Sumyeong and Ko, Jongwoo and Yun, Se-Young},
  date        = {2023-02-10},
  title       = {{CUDA}: Curriculum of Data Augmentation for Long-Tailed Recognition},
  doi         = {10.48550/ARXIV.2302.05499},
  eprint      = {2302.05499},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Ahn2023 - CUDA_ Curriculum of Data Augmentation for Long Tailed Recognition.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Chen2023a,
  author    = {Chen, Yongjie and Liu, Hongmin and Yin, Haoran and Fan, Bin},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title     = {Building Vision Transformers with Hierarchy Aware Feature Aggregation},
  pages     = {5908--5918},
  file      = {:Chen2023a - Building Vision Transformers with Hierarchy Aware Feature Aggregation.pdf:PDF},
  month     = oct,
  year      = {2023},
}

@InProceedings{Psomas2023,
  author         = {Psomas, Bill and Kakogeorgiou, Ioannis and Karantzalos, Konstantinos and Avrithis, Yannis},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {Keep It {SimPool}: Who Said Supervised Transformers Suffer from Attention Deficit?},
  pages          = {5350--5360},
  file           = {:Psomas2023 - Keep It SimPool_ Who Said Supervised Transformers Suffer from Attention Deficit_.pdf:PDF},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Darcet2023,
  author         = {Darcet, Timothée and Oquab, Maxime and Mairal, Julien and Bojanowski, Piotr},
  title          = {Vision Transformers Need Registers},
  date           = {2023-09-28},
  year           = {2023},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2309.16588},
  eprint         = {2309.16588},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  abstract       = {Transformers have recently emerged as a powerful tool for learning visual representations. In this paper, we identify and characterize artifacts in feature maps of both supervised and self-supervised ViT networks. The artifacts correspond to high-norm tokens appearing during inference primarily in low-informative background areas of images, that are repurposed for internal computations. We propose a simple yet effective solution based on providing additional tokens to the input sequence of the Vision Transformer to fill that role. We show that this solution fixes that problem entirely for both supervised and self-supervised models, sets a new state of the art for self-supervised visual models on dense visual prediction tasks, enables object discovery methods with larger models, and most importantly leads to smoother feature maps and attention maps for downstream visual processing.},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Darcet2023 - Vision Transformers Need Registers.pdf:PDF},
  readstatus     = {read},
  qualityassured = {qualityAssured},
}

@Article{Chen2023b,
  author         = {Chen, Xiaohui and Wang, Yinkai and Du, Yuanqi and Hassoun, Soha and Liu, Li-Ping},
  title          = {On Separate Normalization in Self-supervised Transformers},
  date           = {2023-09-22},
  year           = {2023},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2309.12931},
  eprint         = {2309.12931},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CL},
  keywords       = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Chen2023b - On Separate Normalization in Self Supervised Transformers.pdf:PDF},
  readstatus     = {read},
  qualityassured = {qualityAssured},
}

@Article{Fan2023,
  author      = {Fan, Qihang and Huang, Huaibo and Chen, Mingrui and Liu, Hongmin and He, Ran},
  date        = {2023-09-20},
  title       = {{RMT}: Retentive Networks Meet Vision Transformers},
  doi         = {10.48550/ARXIV.2309.11523},
  eprint      = {2309.11523},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Fan2023 - RMT_ Retentive Networks Meet Vision Transformers.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Ma2023,
  author         = {Ma, Wenxuan and Li, Shuang and Zhang, JinMing and Liu, Chi Harold and Kang, Jingxuan and Wang, Yulin and Huang, Gao},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {Borrowing Knowledge From Pre-trained Language Model: A New Data-efficient Visual Learning Paradigm},
  pages          = {18786--18797},
  file           = {:Ma2023 - Borrowing Knowledge from Pre Trained Language Model_ a New Data Efficient Visual Learning Paradigm.pdf:PDF},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Faiz2023,
  author      = {Faiz, Ahmad and Kaneda, Sotaro and Wang, Ruhan and Osi, Rita and Sharma, Parteek and Chen, Fan and Jiang, Lei},
  date        = {2023-09-25},
  title       = {{LLMCarbon}: Modeling the End-to-End Carbon Footprint of Large Language Models},
  doi         = {10.48550/ARXIV.2309.14393},
  eprint      = {2309.14393},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Faiz2023 - LLMCarbon_ Modeling the End to End Carbon Footprint of Large Language Models.pdf:PDF},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Computers and Society (cs.CY), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2023},
}

@Article{Zhou2023,
  author      = {Zhou, Chong and Loy, Chen Change and Dai, Bo},
  date        = {2023-09-19},
  title       = {Interpret Vision Transformers as {ConvNets} with Dynamic Convolutions},
  doi         = {10.48550/ARXIV.2309.10713},
  eprint      = {2309.10713},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Zhou2023 - Interpret Vision Transformers As ConvNets with Dynamic Convolutions.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Xu2023,
  author    = {Xu, Yixing and Li, Chao and Li, Dong and Sheng, Xiao and Jiang, Fan and Tian, Lu and Sirasao, Ashish},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title     = {{FDViT}: Improve the Hierarchical Architecture of Vision Transformer},
  pages     = {5950--5960},
  file      = {:Xu2023 - FDViT_ Improve the Hierarchical Architecture of Vision Transformer.pdf:PDF},
  month     = oct,
  year      = {2023},
}

@InProceedings{Zhao2023,
  author         = {Zhao, Bingyin and Yu, Zhiding and Lan, Shiyi and Cheng, Yutao and Anandkumar, Anima and Lao, Yingjie and Alvarez, Jose M.},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {Fully Attentional Networks with Self-emerging Token Labeling},
  pages          = {5585--5595},
  file           = {:Zhao2023 - Fully Attentional Networks with Self Emerging Token Labeling.pdf:PDF},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

% date/year/month describe the NAACL-HLT 2019 proceedings version; the arXiv
% preprint referenced by `eprint` was first posted on 2018-10-11.
@InProceedings{Devlin2019,
  author      = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  booktitle   = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  date        = {2019-06},
  title       = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  doi         = {10.18653/v1/N19-1423},
  editor      = {Burstein, Jill and Doran, Christy and Solorio, Thamar},
  eprint      = {1810.04805},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  pages       = {4171--4186},
  publisher   = {Association for Computational Linguistics},
  address     = {Minneapolis, Minnesota},
  file        = {:Devlin2019 - BERT_ Pre Training of Deep Bidirectional Transformers for Language Understanding.pdf:PDF},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  month       = jun,
  year        = {2019},
}

@Article{Dai2022,
  author      = {Dai, Jifeng and Shi, Min and Wang, Weiyun and Wu, Sitong and Xing, Linjie and Wang, Wenhai and Zhu, Xizhou and Lu, Lewei and Zhou, Jie and Wang, Xiaogang and Qiao, Yu and Hu, Xiaowei},
  date        = {2022-11-10},
  title       = {Demystify Transformers \& Convolutions in Modern Image Deep Networks},
  doi         = {10.48550/ARXIV.2211.05781},
  eprint      = {2211.05781},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Dai2022 - Demystify Transformers & Convolutions in Modern Image Deep Networks.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2022},
}

@Article{Han2021,
  author      = {Han, Qi and Fan, Zejia and Dai, Qi and Sun, Lei and Cheng, Ming-Ming and Liu, Jiaying and Wang, Jingdong},
  title       = {Demystifying Local Vision Transformer: Sparse Connectivity, Weight Sharing, and Dynamic Weight},
  doi         = {10.48550/ARXIV.2106.04263},
  eprint      = {2106.04263},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  file        = {:Han2021 - Demystifying Local Vision Transformer_ Sparse Connectivity, Weight Sharing, and Dynamic Weight.pdf:PDF},
  publisher   = {arXiv},
  year        = {2021},
}

@Article{Bozic2023,
  author         = {Bozic, Vukasin and Dordevic, Danilo and Coppola, Daniele and Thommes, Joseph},
  title          = {Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as an Alternative to Attention Layers in Transformers},
  year           = {2023},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2311.10642},
  eprint         = {2311.10642},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CL},
  keywords       = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file           = {:Bozic2023 - Rethinking Attention_ Exploring Shallow Feed Forward Neural Networks As an Alternative to Attention Layers in Transformers.pdf:PDF},
  readstatus     = {read},
  qualityassured = {qualityAssured},
}

@Article{He2023b,
  author      = {He, Bobby and Hofmann, Thomas},
  title       = {Simplifying Transformer Blocks},
  date        = {2023-11-03},
  year        = {2023},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2311.01906},
  eprint      = {2311.01906},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:He2023b - Simplifying Transformer Blocks.pdf:PDF},
  priority    = {prio3},
}

% `venue` holds the conference city; `address` is the publisher's location.
% (biblatex treats `address` as an alias of `location`, so the two must not both be set.)
@InProceedings{Zaheer2020,
  author    = {Zaheer, Manzil and Guruganesh, Guru and Dubey, Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and Ahmed, Amr},
  booktitle = {Proceedings of the 34th International Conference on Neural Information Processing Systems},
  title     = {{Big Bird}: Transformers for Longer Sequences},
  isbn      = {9781713829546},
  venue     = {Vancouver, BC, Canada},
  publisher = {Curran Associates Inc.},
  series    = {NIPS'20},
  address   = {Red Hook, NY, USA},
  articleno = {1450},
  file      = {:Zaheer2020 - Big Bird_ Transformers for Longer Sequences.pdf:PDF},
  numpages  = {15},
  year      = {2020},
}

@Article{Child2019,
  author      = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya},
  title       = {Generating Long Sequences with Sparse Transformers},
  date        = {2019-04-23},
  year        = {2019},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.1904.10509},
  eprint      = {1904.10509},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  keywords    = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Child2019 - Generating Long Sequences with Sparse Transformers.pdf:PDF},
}

@InProceedings{Qiu2023,
  author         = {Qiu, Yuwei and Zhang, Kaihao and Wang, Chenxi and Luo, Wenhan and Li, Hongdong and Jin, Zhi},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {{MB-TaylorFormer}: Multi-Branch Efficient Transformer Expanded by {Taylor} Formula for Image Dehazing},
  pages          = {12802--12813},
  file           = {:Qiu2023 - MB TaylorFormer_ Multi Branch Efficient Transformer Expanded by Taylor Formula for Image Dehazing.pdf:PDF},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2023},
}

@InProceedings{Iwana2019,
  author      = {Iwana, Brian Kenji and Kuroki, Ryohei and Uchida, Seiichi},
  booktitle   = {Proceedings - 2019 International Conference on Computer Vision Workshop, ICCVW 2019},
  title       = {Explaining Convolutional Neural Networks using Softmax Gradient Layer-wise Relevance Propagation},
  doi         = {10.1109/ICCVW.2019.00513},
  eprint      = {1908.04351},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  language    = {English},
  pages       = {4176--4185},
  publisher   = {Institute of Electrical and Electronics Engineers Inc.},
  address     = {United States},
  file        = {:Iwana2019 - Explaining Convolutional Neural Networks Using Softmax Gradient Layer Wise Relevance Propagation.pdf:PDF},
  keywords    = {cs.CV, cs.LG, cs.NE},
  month       = oct,
  year        = {2019},
}

% date/year describe the Pattern Recognition publication (vol. 65, May 2017);
% the arXiv preprint referenced by `eprint` was posted on 2015-12-08.
% The citation key is kept unchanged so existing \cite commands keep working.
@Article{Montavon2015,
  author       = {Montavon, Grégoire and Bach, Sebastian and Binder, Alexander and Samek, Wojciech and Müller, Klaus-Robert},
  date         = {2017-05},
  journaltitle = {Pattern Recognition},
  title        = {Explaining Nonlinear Classification Decisions with Deep {Taylor} Decomposition},
  doi          = {10.1016/j.patcog.2016.11.008},
  eprint       = {1512.02479},
  eprintclass  = {cs.LG},
  eprinttype   = {arXiv},
  issn         = {0031-3203},
  pages        = {211--222},
  volume       = {65},
  file         = {:Montavon2015 - Explaining NonLinear Classification Decisions with Deep Taylor Decomposition.pdf:PDF},
  journal      = {Pattern Recognition},
  keywords     = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  month        = may,
  publisher    = {Elsevier BV},
  year         = {2017},
}


@InProceedings{Wang2019,
  author    = {Wang, Xiaodi and Li, Ce and Mou, Yipeng and Zhang, Baochang and Han, Jungong and Liu, Jianzhuang},
  booktitle = {2019 IEEE Winter Conference on Applications of Computer Vision (WACV)},
  date      = {2019-01},
  title     = {{Taylor} Convolutional Networks for Image Classification},
  doi       = {10.1109/wacv.2019.00140},
  publisher = {IEEE},
  file      = {:Wang2019 - Taylor Convolutional Networks for Image Classification.pdf:PDF},
  year      = {2019},
}

@Misc{Zhao2023a,
  author = {Zhao, Hongjue and Chen, Yizhuo and Sun, Dachun and Hu, Yingdong and Liang, Kaizhao and Mao, Yanbing and Sha, Lui and Shao, Huajie},
  title  = {{TaylorNet}: A {Taylor}-Driven Generic Neural Architecture},
  file   = {:Zhao2023a - TaylorNet_ a Taylor Driven Generic Neural Architecture.pdf:PDF},
  year   = {2023},
}


@Article{Xing2020,
  author       = {Xing, Changda and Wang, Meiling and Dong, Chong and Duan, Chaowei and Wang, Zhisheng},
  date         = {2020-08},
  journaltitle = {Neurocomputing},
  title        = {Using {Taylor} Expansion and Convolutional Sparse Representation for Image Fusion},
  doi          = {10.1016/j.neucom.2020.04.002},
  issn         = {0925-2312},
  pages        = {437--455},
  volume       = {402},
  file         = {:Xing2020 - Using Taylor Expansion and Convolutional Sparse Representation for Image Fusion.pdf:PDF},
  journal      = {Neurocomputing},
  publisher    = {Elsevier BV},
  year         = {2020},
}


@InProceedings{Gaikwad2018,
  author    = {Gaikwad, Akash Sunil and El-Sharkawy, Mohamed},
  booktitle = {2018 IEEE International Symposium on Signal Processing and Information Technology (ISSPIT)},
  date      = {2018-12},
  title     = {Pruning Convolution Neural Network ({SqueezeNet}) Using {Taylor} Expansion-Based Criterion},
  doi       = {10.1109/isspit.2018.8705095},
  publisher = {IEEE},
  file      = {:Gaikwad2018 - Pruning Convolution Neural Network (squeezenet) Using Taylor Expansion Based Criterion.pdf:PDF},
  year      = {2018},
}

@InProceedings{Molchanov2017,
  author         = {Molchanov, Pavlo and Tyree, Stephen and Karras, Tero and Aila, Timo and Kautz, Jan},
  title          = {Pruning Convolutional Neural Networks for Resource Efficient Inference},
  booktitle      = {International Conference on Learning Representations},
  year           = {2017},
  doi            = {10.48550/arxiv.1611.06440},
  eprint         = {1611.06440},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  keywords       = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  file           = {:Molchanov2017 - Pruning Convolutional Neural Networks for Resource Efficient Inference.pdf:PDF},
  readstatus     = {skimmed},
  qualityassured = {qualityAssured},
}

@InProceedings{Schaeffer2023,
  author      = {Schaeffer, Rylan and Miranda, Brando and Koyejo, Sanmi},
  title       = {Are Emergent Abilities of Large Language Models a Mirage?},
  booktitle   = {Thirty-seventh Conference on Neural Information Processing Systems},
  year        = {2023},
  doi         = {10.48550/arxiv.2304.15004},
  eprint      = {2304.15004},
  eprinttype  = {arXiv},
  eprintclass = {cs.AI},
  url         = {https://openreview.net/forum?id=ITw9edRDlD},
  keywords    = {Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Schaeffer2023 - Are Emergent Abilities of Large Language Models a Mirage_.pdf:PDF},
  priority    = {prio3},
}

@InProceedings{Nivron2023,
  author      = {Nivron, Omer and Parthipan, Raghul and Wischik, Damon},
  title       = {Taylorformer: Probabalistic Modelling for Random Processes including Time Series},
  booktitle   = {ICML Workshop on New Frontiers in Learning, Control, and Dynamical Systems},
  year        = {2023},
  doi         = {10.48550/arxiv.2305.19141},
  eprint      = {2305.19141},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Nivron2023 - Taylorformer_ Probabalistic Modelling for Random Processes Including Time Series.pdf:PDF},
}

@InProceedings{Nauen2025,
  author        = {Tobias Christian Nauen and Sebastian Palacio and Andreas Dengel},
  booktitle     = {Proceedings of the Winter Conference on Applications of Computer Vision (WACV)},
  title         = {Which Transformer to Favor: A Comparative Analysis of Efficiency in Vision Transformers},
  eprint        = {2308.09372},
  pages         = {6955--6966},
  archiveprefix = {arXiv},
  month         = feb,
  primaryclass  = {cs.CV},
  year          = {2025},
}

@Article{Bulatov2023,
  author      = {Bulatov, Aydar and Kuratov, Yuri and Burtsev, Mikhail S.},
  date        = {2023-04-19},
  title       = {Scaling Transformer to {1M} tokens and beyond with {RMT}},
  doi         = {10.48550/ARXIV.2304.11062},
  eprint      = {2304.11062},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  file        = {:Bulatov2023 - Scaling Transformer to 1M Tokens and beyond with RMT.pdf:PDF},
  year        = {2023},
}

@InProceedings{Gu2022,
  author         = {Gu, Albert and Goel, Karan and Ré, Christopher},
  booktitle      = {International Conference on Learning Representations},
  title          = {Efficiently Modeling Long Sequences with Structured State Spaces},
  doi            = {10.48550/arxiv.2111.00396},
  eprint         = {2111.00396},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  file           = {:Gu2022 - Efficiently Modeling Long Sequences with Structured State Spaces.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@InProceedings{Maas2011,
  author    = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  title     = {Learning Word Vectors for Sentiment Analysis},
  editor    = {Lin, Dekang and Matsumoto, Yuji and Mihalcea, Rada},
  pages     = {142--150},
  publisher = {Association for Computational Linguistics},
  address   = {Portland, Oregon, USA},
  file      = {:Maas2011 - Learning Word Vectors for Sentiment Analysis.pdf:PDF},
  month     = jun,
  year      = {2011},
}

@InProceedings{Dass2023,
  author    = {Dass, Jyotikrishna and Wu, Shang and Shi, Huihong and Li, Chaojian and Ye, Zhifan and Wang, Zhongfeng and Lin, Yingyan},
  booktitle = {2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
  title     = {{ViTALiTy}: Unifying Low-rank and Sparse Approximation for Vision Transformer Acceleration with a Linear {Taylor} Attention},
  doi       = {10.1109/HPCA56546.2023.10071081},
  pages     = {415--428},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  keywords  = {training;costs;systematics;approximation algorithms;transformers;boosting;sparse representation},
  month     = mar,
  year      = {2023},
}

@InProceedings{Nangia2018,
  author    = {Nangia, Nikita and Bowman, Samuel},
  title     = {{L}ist{O}ps: A Diagnostic Dataset for Latent Tree Learning},
  booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Student Research Workshop},
  editor    = {Cordeiro, Silvio Ricardo and Oraby, Shereen and Pavalanathan, Umashanthi and Rim, Kyeongmin},
  year      = {2018},
  month     = jun,
  pages     = {92--99},
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana, USA},
  doi       = {10.18653/v1/N18-4013},
}

@Misc{Chen2024,
  author      = {Chen, Hongzhan and Quan, Xiaojun and Chen, Hehong and Yan, Ming and Zhang, Ji},
  date        = {2024},
  title       = {Knowledge Distillation for Closed-Source Language Models},
  doi         = {10.48550/ARXIV.2401.07013},
  eprint      = {2401.07013},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  file        = {:Chen2024 - Knowledge Distillation for Closed Source Language Models.pdf:PDF},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  year        = {2024},
}

@Misc{ElNouby2024,
  author      = {El-Nouby, Alaaeldin and Klein, Michal and Zhai, Shuangfei and Bautista, Miguel Angel and Toshev, Alexander and Shankar, Vaishaal and Susskind, Joshua M and Joulin, Armand},
  date        = {2024},
  title       = {Scalable Pre-training of Large Autoregressive Image Models},
  doi         = {10.48550/ARXIV.2401.08541},
  eprint      = {2401.08541},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  file        = {:ElNouby2024 - Scalable Pre Training of Large Autoregressive Image Models.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  year        = {2024},
}

@Misc{Heddes2024,
  author         = {Heddes, Mike and Srinivasa, Narayan and Givargis, Tony and Nicolau, Alexandru},
  date           = {2024},
  title          = {Always-Sparse Training by Growing Connections with Guided Stochastic Exploration},
  doi            = {10.48550/ARXIV.2401.06898},
  eprint         = {2401.06898},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  file           = {:Heddes2024 - Always Sparse Training by Growing Connections with Guided Stochastic Exploration.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Misc{Li2024,
  author  = {Li, Y. and Lei, Y. and Yang, X.},
  date    = {2024},
  title   = {{Spikeformer}: Training High-Performance Spiking Neural Network with Transformer},
  url     = {https://www.sciencedirect.com/science/article/pii/S092523122400050X},
  urldate = {2024-02-05},
  year    = {2024},
}

@Misc{Owen2024,
  author      = {Owen, David},
  date        = {2024},
  title       = {How predictable is language model benchmark performance?},
  doi         = {10.48550/ARXIV.2401.04757},
  eprint      = {2401.04757},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  file        = {:Owen2024 - How Predictable Is Language Model Benchmark Performance_.pdf:PDF},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio3},
  year        = {2024},
}

@Misc{Zhou2024,
  author      = {Zhou, Xingyu and Zhang, Leheng and Zhao, Xiaorui and Wang, Keze and Li, Leida and Gu, Shuhang},
  date        = {2024},
  title       = {Video Super-Resolution Transformer with Masked Inter \& Intra-Frame Attention},
  doi         = {10.48550/ARXIV.2401.06312},
  eprint      = {2401.06312},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  file        = {:Zhou2024 - Video Super Resolution Transformer with Masked Inter & Intra Frame Attention.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  year        = {2024},
}

@Misc{Godey2024,
  author         = {Godey, Nathan and de la Clergerie, Éric and Sagot, Benoît},
  date           = {2024},
  title          = {Anisotropy Is Inherent to Self-Attention in Transformers},
  doi            = {10.48550/ARXIV.2401.12143},
  eprint         = {2401.12143},
  eprintclass    = {cs.CL},
  eprinttype     = {arXiv},
  file           = {:Godey2024 - Anisotropy Is Inherent to Self Attention in Transformers.pdf:PDF},
  keywords       = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Misc{Li2024a,
  author      = {Li, Mukai and Li, Lei and Yin, Yuwei and Ahmed, Masood and Liu, Zhenguang and Liu, Qi},
  date        = {2024},
  title       = {Red Teaming Visual Language Models},
  doi         = {10.48550/ARXIV.2401.12915},
  eprint      = {2401.12915},
  eprintclass = {cs.AI},
  eprinttype  = {arXiv},
  file        = {:Li2024a - Red Teaming Visual Language Models.pdf:PDF},
  keywords    = {Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  year        = {2024},
}

@Misc{Zhang2024,
  author         = {Zhang, Yiyuan and Ding, Xiaohan and Gong, Kaixiong and Ge, Yixiao and Shan, Ying and Yue, Xiangyu},
  date           = {2024},
  title          = {Multimodal Pathway: Improve Transformers with Irrelevant Data from Other Modalities},
  doi            = {10.48550/ARXIV.2401.14405},
  eprint         = {2401.14405},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  file           = {:Zhang2024 - Multimodal Pathway_ Improve Transformers with Irrelevant Data from Other Modalities.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Misc{Lee2024,
  author         = {Lee, Seungho and Kang, Seoungyoon and Shim, Hyunjung},
  date           = {2024},
  title          = {Self-Supervised Vision Transformers Are Efficient Segmentation Learners for Imperfect Labels},
  doi            = {10.48550/ARXIV.2401.12535},
  eprint         = {2401.12535},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  file           = {:Lee2024 - Self Supervised Vision Transformers Are Efficient Segmentation Learners for Imperfect Labels.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@InProceedings{Alayrac2022,
  author       = {Jean-Baptiste Alayrac and Jeff Donahue and Pauline Luc and Antoine Miech and Iain Barr and Yana Hasson and Karel Lenc and Arthur Mensch and Katherine Millican and Malcolm Reynolds and Roman Ring and Eliza Rutherford and Serkan Cabi and Tengda Han and Zhitao Gong and Sina Samangooei and Marianne Monteiro and Jacob Menick and Sebastian Borgeaud and Andrew Brock and Aida Nematzadeh and Sahand Sharifzadeh and Mikolaj Binkowski and Ricardo Barreira and Oriol Vinyals and Andrew Zisserman and Karen Simonyan},
  title        = {Flamingo: a Visual Language Model for Few-Shot Learning},
  booktitle    = {Advances in Neural Information Processing Systems},
  editor       = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho},
  year         = {2022},
  url          = {https://openreview.net/forum?id=EbMuimAbPbs},
  file         = {:Alayrac2022 - Flamingo_ a Visual Language Model for Few Shot Learning.pdf:PDF},
  creationdate = {2024-02-06T15:02:51},
}

@InProceedings{Krause2013,
  author    = {Krause, Jonathan and Stark, Michael and Deng, Jia and Fei-Fei, Li},
  title     = {3D Object Representations for Fine-Grained Categorization},
  booktitle = {4th International IEEE Workshop on 3D Representation and Recognition (3dRR-13)},
  year      = {2013},
  address   = {Sydney, Australia},
  comment   = {Stanford Cars Dataset},
}

@InProceedings{Nilsback2008,
  author    = {Nilsback, Maria-Elena and Zisserman, Andrew},
  booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing},
  title     = {Automated Flower Classification over a Large Number of Classes},
  comment   = {Oxford Flowers 102 Dataset},
  month     = dec,
  year      = {2008},
}

@InProceedings{Zhou2014,
  author       = {Zhou, Bolei and Lapedriza, Agata and Xiao, Jianxiong and Torralba, Antonio and Oliva, Aude},
  title        = {Learning Deep Features for Scene Recognition using Places Database},
  booktitle    = {Advances in Neural Information Processing Systems},
  editor       = {Z. Ghahramani and M. Welling and C. Cortes and N. Lawrence and K.Q. Weinberger},
  volume       = {27},
  date         = {2014},
  year         = {2014},
  publisher    = {Curran Associates, Inc.},
  url          = {https://proceedings.neurips.cc/paper_files/paper/2014/file/3fe94a002317b5f9259f82690aeea4cd-Paper.pdf},
  comment      = {MIT Places365 Dataset},
  creationdate = {2024-02-08T11:29:18},
}

@InProceedings{Wang2020a,
  author    = {Wang, Hanrui and Wu, Zhanghao and Liu, Zhijian and Cai, Han and Zhu, Ligeng and Gan, Chuang and Han, Song},
  title     = {{HAT}: Hardware-Aware Transformers for Efficient Natural Language Processing},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  editor    = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel},
  year      = {2020},
  month     = jul,
  pages     = {7675--7688},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  doi       = {10.18653/v1/2020.acl-main.686},
  url       = {https://aclanthology.org/2020.acl-main.686},
  file      = {:Wang2020a - HAT_ Hardware Aware Transformers for Efficient Natural Language Processing.pdf:PDF},
}

@InProceedings{Tabani2021,
  author    = {Tabani, Hamid and Balasubramaniam, Ajay and Marzban, Shabbir and Arani, Elahe and Zonooz, Bahram},
  booktitle = {2021 24th Euromicro Conference on Digital System Design (DSD)},
  title     = {Improving the Efficiency of Transformers for Resource-Constrained Devices},
  doi       = {10.1109/DSD53832.2021.00074},
  pages     = {449--456},
  file      = {:Tabani2021 - Improving the Efficiency of Transformers for Resource Constrained Devices.pdf:PDF},
  keywords  = {Performance evaluation;Computer vision;Digital systems;Memory management;Transformers;Data transfer;Natural language processing;Deep Learning;Clustering;Resource-Constrained Devices},
  priority  = {prio3},
  year      = {2021},
}

@Misc{Torpey2024,
  author       = {Torpey, David and Klein, Richard},
  date         = {2024},
  title        = {Affine transformation estimation improves visual self-supervised learning},
  doi          = {10.48550/ARXIV.2402.09071},
  eprint       = {2402.09071},
  eprinttype   = {arXiv},
  creationdate = {2024-02-19T14:49:01},
  file         = {:auto/Torpey2024 - Affine_transformation_estimation_improves_visual_self-supervised_learning.pdf:PDF},
  keywords     = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  year         = {2024},
}

@Article{Wu2018,
  author         = {Wu, Shuang and Li, Guoqi and Deng, Lei and Liu, Liu and Xie, Yuan and Shi, Luping},
  date           = {2019-07},
  journaltitle   = {IEEE Transactions on Neural Networks and Learning Systems},
  title          = {L1-Norm Batch Normalization for Efficient Training of Deep Neural Networks},
  doi            = {10.1109/tnnls.2018.2876179},
  eprint         = {1802.09769},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  issn           = {2162-2388},
  number         = {7},
  pages          = {2043--2051},
  volume         = {30},
  comment        = {date/year refer to the journal issue (vol. 30, no. 7); the arXiv preprint 1802.09769 was submitted on 2018-02-27.},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Wu2018 - L1 Norm Batch Normalization for Efficient Training of Deep Neural Networks.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  month          = jul,
  publisher      = {Institute of Electrical and Electronics Engineers (IEEE)},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2019},
}

@Article{Arora2024,
  author      = {Arora, Simran and Eyuboglu, Sabri and Zhang, Michael and Timalsina, Aman and Alberti, Silas and Zinsley, Dylan and Zou, James and Rudra, Atri and Ré, Christopher},
  title       = {Simple linear attention language models balance the recall-throughput tradeoff},
  publisher   = {arXiv},
  date        = {2024-02-28},
  year        = {2024},
  doi         = {10.48550/ARXIV.2402.18668},
  eprint      = {2402.18668},
  eprinttype  = {arXiv},
  eprintclass = {cs.CL},
  keywords    = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {Creative Commons Zero v1.0 Universal},
  file        = {:Arora2024 - Simple Linear Attention Language Models Balance the Recall Throughput Tradeoff.pdf:PDF},
  priority    = {prio3},
}

@InProceedings{Nauen2024,
  author    = {Nauen, Tobias Christian and Palacio, Sebastian and Dengel, Andreas},
  booktitle = {Pattern Recognition},
  title     = {{TaylorShift}: Shifting the Complexity of Self-attention from Squared to Linear (and Back) Using {Taylor-Softmax}},
  doi       = {10.1007/978-3-031-78172-8_1},
  editor    = {Antonacopoulos, Apostolos and Chaudhuri, Subhasis and Chellappa, Rama and Liu, Cheng-Lin and Bhattacharya, Saumik and Pal, Umapada},
  isbn      = {978-3-031-78172-8},
  note      = {ICPR 2024 (oral)},
  pages     = {1--16},
  publisher = {Springer Nature Switzerland},
  address   = {Cham},
  file      = {:auto/Nauen2024 - TaylorShift__Shifting_the_Complexity_of_Self-Attention_from_Squared_to_Linear_(and_Back)_using_Taylor-Softmax.pdf:PDF},
  year      = {2024},
}

@Article{Yang2024,
  author         = {Yang, Kai and Ackermann, Jan and He, Zhenyu and Feng, Guhao and Zhang, Bohang and Feng, Yunzhen and Ye, Qiwei and He, Di and Wang, Liwei},
  title          = {Do Efficient Transformers Really Save Computation?},
  publisher      = {arXiv},
  date           = {2024-02-21},
  year           = {2024},
  doi            = {10.48550/ARXIV.2402.13934},
  eprint         = {2402.13934},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Yang2024 - Do Efficient Transformers Really Save Computation_.pdf:PDF},
  creationdate   = {2024-03-11T14:19:39},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Zuo2022,
  author         = {Zuo, Shuangquan and Xiao, Yun and Chang, Xiaojun and Wang, Xuanhong},
  date           = {2022-10},
  journaltitle   = {Knowledge-Based Systems},
  title          = {Vision transformers for dense prediction: A survey},
  doi            = {10.1016/j.knosys.2022.109552},
  issn           = {0950-7051},
  pages          = {109552},
  url            = {https://www.sciencedirect.com/science/article/pii/S0950705122007821},
  volume         = {253},
  creationdate   = {2024-03-11T16:43:35},
  file           = {:Zuo2022 - Vision Transformers for Dense Prediction_ a Survey.pdf:PDF},
  keywords       = {Deep learning, Transformer, Dense prediction, Computer vision},
  publisher      = {Elsevier {BV}},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@Article{Strudel2021,
  author         = {Strudel, Robin and Garcia, Ricardo and Laptev, Ivan and Schmid, Cordelia},
  title          = {Segmenter: Transformer for Semantic Segmentation},
  publisher      = {arXiv},
  date           = {2021-05-12},
  year           = {2021},
  doi            = {10.48550/ARXIV.2105.05633},
  eprint         = {2105.05633},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Strudel2021 - Segmenter_ Transformer for Semantic Segmentation.pdf:PDF},
  creationdate   = {2024-03-12T16:21:00},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Chen2022a,
  author       = {Chen, Zhe and Duan, Yuchen and Wang, Wenhai and He, Junjun and Lu, Tong and Dai, Jifeng and Qiao, Yu},
  title        = {Vision Transformer Adapter for Dense Predictions},
  publisher    = {arXiv},
  date         = {2022-05-17},
  year         = {2022},
  doi          = {10.48550/ARXIV.2205.08534},
  eprint       = {2205.08534},
  eprinttype   = {arXiv},
  eprintclass  = {cs.CV},
  abstract     = {This work investigates a simple yet powerful dense prediction task adapter for Vision Transformer (ViT). Unlike recently advanced variants that incorporate vision-specific inductive biases into their architectures, the plain ViT suffers inferior performance on dense predictions due to weak prior assumptions. To address this issue, we propose the ViT-Adapter, which allows plain ViT to achieve comparable performance to vision-specific transformers. Specifically, the backbone in our framework is a plain ViT that can learn powerful representations from large-scale multi-modal data. When transferring to downstream tasks, a pre-training-free adapter is used to introduce the image-related inductive biases into the model, making it suitable for these tasks. We verify ViT-Adapter on multiple dense prediction tasks, including object detection, instance segmentation, and semantic segmentation. Notably, without using extra detection data, our ViT-Adapter-L yields state-of-the-art 60.9 box AP and 53.0 mask AP on COCO test-dev. We hope that the ViT-Adapter could serve as an alternative for vision-specific transformers and facilitate future research. The code and models will be released at https://github.com/czczup/ViT-Adapter.},
  keywords     = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright    = {arXiv.org perpetual, non-exclusive license},
  file         = {:Chen2022a - Vision Transformer Adapter for Dense Predictions.pdf:PDF},
  creationdate = {2024-03-12T16:47:07},
  priority     = {prio3},
}

@InProceedings{Teney2024,
  author         = {Teney, Damien and Nicolicioiu, Armand and Hartmann, Valentin and Abbasnejad, Ehsan},
  booktitle      = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2024},
  date           = {2024-03-04},
  title          = {Neural Redshift: Random Networks are not Random Functions},
  eprint         = {2403.02241},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  creationdate   = {2024-03-13T11:44:28},
  file           = {:Teney2024 - Neural Redshift_ Random Networks Are Not Random Functions.pdf:PDF},
  keywords       = {cs.LG, cs.AI, cs.CV},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@InProceedings{Zheng2023,
  author         = {Zheng, Lin and Yuan, Jianbo and Wang, Chong and Kong, Lingpeng},
  booktitle      = {The Eleventh International Conference on Learning Representations},
  title          = {Efficient Attention via Control Variates},
  doi            = {10.48550/ARXIV.2302.04542},
  eprint         = {2302.04542},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  file           = {:Zheng2023 - Efficient Attention Via Control Variates.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@InProceedings{Peng2021a,
  author         = {Peng, Hao and Pappas, Nikolaos and Yogatama, Dani and Schwartz, Roy and Smith, Noah A. and Kong, Lingpeng},
  title          = {Random Feature Attention},
  booktitle      = {International Conference on Learning Representations},
  date           = {2021-03-03},
  year           = {2021},
  eprint         = {2103.02143},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CL},
  keywords       = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  file           = {:Peng2021a - Random Feature Attention.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Xiong2022,
  author         = {Xiong, Wenhan and Oguz, Barlas and Gupta, Anchit and Chen, Xilun and Liskovich, Diana and Levy, Omer and Yih, Scott and Mehdad, Yashar},
  title          = {Simple Local Attentions Remain Competitive for Long-Context Tasks},
  editor         = {Carpuat, Marine and de Marneffe, Marie-Catherine and Meza Ruiz, Ivan Vladimir},
  booktitle      = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher      = {Association for Computational Linguistics},
  address        = {Seattle, United States},
  pages          = {1975--1986},
  month          = jul,
  year           = {2022},
  doi            = {10.18653/v1/2022.naacl-main.144},
  eprint         = {2112.07210},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CL},
  url            = {https://aclanthology.org/2022.naacl-main.144},
  keywords       = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  file           = {:Xiong2021a - Simple Local Attentions Remain Competitive for Long Context Tasks.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Zhang2023b,
  author         = {Zhang, Jun and Jiang, Shuyang and Feng, Jiangtao and Zheng, Lin and Kong, Lingpeng},
  booktitle      = {International Conference on Machine Learning},
  title          = {{CAB}: Comprehensive Attention Benchmarking on Long Sequence Modeling},
  eprint         = {2210.07661},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  pages          = {41194--41218},
  publisher      = {PMLR},
  file           = {:Zhang2023b - CAB_ Comprehensive Attention Benchmarking on Long Sequence Modeling.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Kitaev2020,
  author      = {Kitaev, Nikita and Kaiser, Lukasz and Levskaya, Anselm},
  title       = {Reformer: The Efficient Transformer},
  publisher   = {arXiv},
  date        = {2020-01-13},
  year        = {2020},
  doi         = {10.48550/ARXIV.2001.04451},
  eprint      = {2001.04451},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  abstract    = {Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its complexity from O($L^2$) to O($L\log L$), where $L$ is the length of the sequence. Furthermore, we use reversible residual layers instead of the standard residuals, which allows storing activations only once in the training process instead of $N$ times, where $N$ is the number of layers. The resulting model, the Reformer, performs on par with Transformer models while being much more memory-efficient and much faster on long sequences.},
  keywords    = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Kitaev2020 - Reformer_ the Efficient Transformer.pdf:PDF},
}

@TechReport{Radford2019,
  author      = {Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  institution = {OpenAI},
  title       = {Language Models are Unsupervised Multitask Learners},
  url         = {https://api.semanticscholar.org/CorpusID:160025533},
  year        = {2019},
}

@Article{Li2023c,
  author      = {Li, Xiangtai and Ding, Henghui and Yuan, Haobo and Zhang, Wenwei and Pang, Jiangmiao and Cheng, Guangliang and Chen, Kai and Liu, Ziwei and Loy, Chen Change},
  title       = {Transformer-Based Visual Segmentation: A Survey},
  publisher   = {arXiv},
  date        = {2023-04-19},
  year        = {2023},
  doi         = {10.48550/ARXIV.2304.09854},
  eprint      = {2304.09854},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Li2023c - Transformer Based Visual Segmentation_ a Survey.pdf:PDF:http\://arxiv.org/pdf/2304.09854v3},
  priority    = {prio3},
}

@InProceedings{Quetu2023,
  author         = {Quétu, Victor and Milovanovic, Marta and Tartaglione, Enzo},
  booktitle      = {Lecture Notes in Computer Science},
  date           = {2023-07-26},
  title          = {Sparse Double Descent in Vision Transformers: real or phantom threat?},
  doi            = {10.1007/978-3-031-43153-1_41},
  eprint         = {2307.14253},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  isbn           = {9783031431531},
  issn           = {1611-3349},
  pages          = {490--502},
  publisher      = {Springer Nature Switzerland},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Quetu2023 - Sparse Double Descent in Vision Transformers_ Real or Phantom Threat_.pdf:PDF:http\://arxiv.org/pdf/2307.14253v1},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Frumkin2023,
  author         = {Frumkin, Natalia and Gope, Dibakar and Marculescu, Diana},
  title          = {Jumping through Local Minima: Quantization in the Loss Landscape of Vision Transformers},
  publisher      = {arXiv},
  date           = {2023-08-21},
  year           = {2023},
  doi            = {10.48550/ARXIV.2308.10814},
  eprint         = {2308.10814},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution Share Alike 4.0 International},
  file           = {:Frumkin2023 - Jumping through Local Minima_ Quantization in the Loss Landscape of Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2308.10814v2},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Liu2021a,
  author         = {Liu, Yahui and Sangineto, Enver and Bi, Wei and Sebe, Nicu and Lepri, Bruno and De Nadai, Marco},
  booktitle      = {Proceedings of the 35th Conference on Neural Information Processing Systems (NeurIPS) 2021},
  date           = {2021-06-07},
  title          = {Efficient Training of Visual Transformers with Small Datasets},
  doi            = {10.48550/ARXIV.2106.03746},
  eprint         = {2106.03746},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Liu2021a - Efficient Training of Visual Transformers with Small Datasets.pdf:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority       = {prio2},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  year           = {2021},
}

@Misc{Yang2024a,
  author         = {Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
  date           = {2024-01-19},
  title          = {{Depth Anything}: Unleashing the Power of Large-Scale Unlabeled Data},
  doi            = {10.48550/ARXIV.2401.10891},
  eprint         = {2401.10891},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:http\://arxiv.org/pdf/2401.10891v2:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Misc{Aich2024,
  author       = {Aich, Abhishek and Suh, Yumin and Schulter, Samuel and Chandraker, Manmohan},
  date         = {2024},
  title        = {Progressive Token Length Scaling in Transformer Encoders for Efficient Universal Segmentation},
  doi          = {10.48550/ARXIV.2404.14657},
  eprint       = {2404.14657},
  eprinttype   = {arXiv},
  url          = {https://arxiv.org/abs/2404.14657},
  urldate      = {2024-04-25},
  creationdate = {2024-04-25},
  file         = {:auto/Aich2024 - Progressive_Token_Length_Scaling_in_Transformer_Encoders_for_Efficient_Universal_Segmentation.pdf:PDF},
  keywords     = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  year         = {2024},
}

@Article{Bafghi2024,
  author         = {Bafghi, Reza Akbarian and Harilal, Nidhin and Monteleoni, Claire and Raissi, Maziar},
  date           = {2024-04-26},
  title          = {Parameter Efficient Fine-tuning of Self-supervised {ViTs} without Catastrophic Forgetting},
  doi            = {10.48550/ARXIV.2404.17245},
  eprint         = {2404.17245},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution Share Alike 4.0 International},
  file           = {:Bafghi2024 - Parameter Efficient Fine Tuning of Self Supervised ViTs without Catastrophic Forgetting.pdf:PDF:http\://arxiv.org/pdf/2404.17245v1},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@InProceedings{Zhu2021,
  author         = {Zhu, Chen and Ni, Renkun and Xu, Zheng and Kong, Kezhi and Huang, W. Ronny and Goldstein, Tom},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {{GradInit}: Learning to Initialize Neural Networks for Stable and Efficient Training},
  editor         = {Ranzato, M. and Beygelzimer, A. and Dauphin, Y. and Liang, P. S. and Wortman Vaughan, J.},
  pages          = {16410--16422},
  publisher      = {Curran Associates, Inc.},
  url            = {https://proceedings.neurips.cc/paper_files/paper/2021/file/88ae6372cfdc5df69a976e893f4d554b-Paper.pdf},
  volume         = {34},
  file           = {:Zhu2021 - GradInit_ Learning to Initialize Neural Networks for Stable and Efficient Training.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Skorski2021,
  author         = {Skorski, Maciej and Temperoni, Alessandro and Theobald, Martin},
  booktitle      = {Proceedings of The 13th Asian Conference on Machine Learning},
  title          = {Revisiting Weight Initialization of Deep Neural Networks},
  editor         = {Balasubramanian, Vineeth N. and Tsang, Ivor},
  eventdate      = {2021-11-17/2021-11-19},
  pages          = {1192--1207},
  publisher      = {PMLR},
  series         = {Proceedings of Machine Learning Research},
  url            = {https://proceedings.mlr.press/v157/skorski21a.html},
  volume         = {157},
  file           = {:Skorski2021 - Revisiting Weight Initialization of Deep Neural Networks.pdf:PDF},
  month          = nov,
  pdf            = {https://proceedings.mlr.press/v157/skorski21a/skorski21a.pdf},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Schuerholt2022,
  author         = {Schürholt, Konstantin and Knyazev, Boris and Giró-i-Nieto, Xavier and Borth, Damian},
  title          = {Hyper-Representations as Generative Models: Sampling Unseen Neural Network Weights},
  publisher      = {arXiv},
  date           = {2022-09-29},
  year           = {2022},
  doi            = {10.48550/ARXIV.2209.14733},
  eprint         = {2209.14733},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  keywords       = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Schuerholt2022 - Hyper Representations As Generative Models_ Sampling Unseen Neural Network Weights.pdf:PDF:http\://arxiv.org/pdf/2209.14733v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Das2021,
  author         = {Das, Debasmit and Bhalgat, Yash and Porikli, Fatih},
  date           = {2021-05-02},
  title          = {Data-driven Weight Initialization with {Sylvester} Solvers},
  doi            = {10.48550/ARXIV.2105.10335},
  eprint         = {2105.10335},
  eprintclass    = {cs.NE},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Das2021 - Data Driven Weight Initialization with Sylvester Solvers.pdf:PDF:http\://arxiv.org/pdf/2105.10335v1},
  keywords       = {Neural and Evolutionary Computing (cs.NE), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Yang2022c,
  author         = {Yang, Yibo and Wang, Hong and Yuan, Haobo and Lin, Zhouchen},
  title          = {Towards Theoretically Inspired Neural Initialization Optimization},
  publisher      = {arXiv},
  date           = {2022-10-12},
  year           = {2022},
  doi            = {10.48550/ARXIV.2210.05956},
  eprint         = {2210.05956},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  keywords       = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Yang2022c - Towards Theoretically Inspired Neural Initialization Optimization.pdf:PDF:http\://arxiv.org/pdf/2210.05956v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Geiping2022,
  author         = {Geiping, Jonas and Goldblum, Micah and Somepalli, Gowthami and Shwartz-Ziv, Ravid and Goldstein, Tom and Wilson, Andrew Gordon},
  title          = {How Much Data Are Augmentations Worth? An Investigation into Scaling Laws, Invariance, and Implicit Regularization},
  publisher      = {arXiv},
  date           = {2022-10-12},
  year           = {2022},
  doi            = {10.48550/ARXIV.2210.06441},
  eprint         = {2210.06441},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  note           = {under review for ICLR 2023},
  keywords       = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Geiping2022 - How Much Data Are Augmentations Worth_ an Investigation into Scaling Laws, Invariance, and Implicit Regularization.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Xu2023a,
  author         = {Xu, Jiarui and Liu, Sifei and Vahdat, Arash and Byeon, Wonmin and Wang, Xiaolong and De Mello, Shalini},
  title          = {Open-Vocabulary Panoptic Segmentation with Text-to-Image Diffusion Models},
  publisher      = {arXiv},
  date           = {2023-03-08},
  year           = {2023},
  doi            = {10.48550/ARXIV.2303.04803},
  eprint         = {2303.04803},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Xu2023a - Open Vocabulary Panoptic Segmentation with Text to Image Diffusion Models.pdf:PDF:http\://arxiv.org/pdf/2303.04803v4},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Min2022,
  author         = {Min, Seonwoo and Park, Nokyung and Kim, Siwon and Park, Seunghyun and Kim, Jinkyu},
  title          = {Grounding Visual Representations with Texts for Domain Generalization},
  publisher      = {arXiv},
  date           = {2022-07-21},
  year           = {2022},
  doi            = {10.48550/ARXIV.2207.10285},
  eprint         = {2207.10285},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Min2022 - Grounding Visual Representations with Texts for Domain Generalization.pdf:PDF:http\://arxiv.org/pdf/2207.10285v2},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Plummer2015,
  author         = {Plummer, Bryan A. and Wang, Liwei and Cervantes, Chris M. and Caicedo, Juan C. and Hockenmaier, Julia and Lazebnik, Svetlana},
  date           = {2015-05-19},
  title          = {{Flickr30k Entities}: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models},
  doi            = {10.48550/ARXIV.1505.04870},
  eprint         = {1505.04870},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Plummer2015 - Flickr30k Entities_ Collecting Region to Phrase Correspondences for Richer Image to Sentence Models.pdf:PDF:http\://arxiv.org/pdf/1505.04870v4},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2015},
}

@InProceedings{Frankle2018,
  author         = {Frankle, Jonathan and Carbin, Michael},
  booktitle      = {International Conference on Learning Representations ({ICLR}) 2019},
  date           = {2018-03-09},
  title          = {The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks},
  doi            = {10.48550/ARXIV.1803.03635},
  eprint         = {1803.03635},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Frankle2018 - The Lottery Ticket Hypothesis_ Finding Sparse, Trainable Neural Networks.pdf:PDF:http\://arxiv.org/pdf/1803.03635v5},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2018},
}

@Article{Lee2024a,
  author         = {Lee, Hojun and Kim, Suyoung and Lee, Junhoo and Yoo, Jaeyoung and Kwak, Nojun},
  title          = {Coreset Selection for Object Detection},
  publisher      = {arXiv},
  date           = {2024-04-14},
  year           = {2024},
  doi            = {10.48550/ARXIV.2404.09161},
  eprint         = {2404.09161},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Lee2024a - Coreset Selection for Object Detection.pdf:PDF:http\://arxiv.org/pdf/2404.09161v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Wu2023,
  author      = {Wu, Xindi and Zhang, Byron and Deng, Zhiwei and Russakovsky, Olga},
  title       = {Vision-Language Dataset Distillation},
  publisher   = {arXiv},
  date        = {2023-08-15},
  year        = {2023},
  doi         = {10.48550/ARXIV.2308.07545},
  eprint      = {2308.07545},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Wu2023 - Vision Language Dataset Distillation.pdf:PDF:http\://arxiv.org/pdf/2308.07545v3},
  priority    = {prio2},
}

@InProceedings{Nakkiran2020,
  author         = {Nakkiran, Preetum and Kaplun, Gal and Bansal, Yamini and Yang, Tristan and Barak, Boaz and Sutskever, Ilya},
  booktitle      = {International Conference on Learning Representations},
  title          = {Deep Double Descent: Where Bigger Models and More Data Hurt},
  eprint         = {1912.02292},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  url            = {https://openreview.net/forum?id=B1g5sA4twr},
  file           = {:Nakkiran2020 - Deep Double Descent_ Where Bigger Models and More Data Hurt.pdf:PDF:http\://arxiv.org/pdf/1912.02292v1},
  keywords       = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), Machine Learning (stat.ML), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2020},
}

@Article{Laurencon2024,
  author      = {Laurençon, Hugo and Tronchon, Léo and Cord, Matthieu and Sanh, Victor},
  title       = {What matters when building vision-language models?},
  publisher   = {arXiv},
  date        = {2024-05-03},
  year        = {2024},
  doi         = {10.48550/ARXIV.2405.02246},
  eprint      = {2405.02246},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Laurencon2024 - What Matters When Building Vision Language Models_.pdf:PDF:http\://arxiv.org/pdf/2405.02246v1},
  priority    = {prio2},
}

@Article{Cheekati2024,
  author      = {Cheekati, Shravan},
  title       = {Early Transformers: A study on Efficient Training of Transformer Models through Early-Bird Lottery Tickets},
  date        = {2024-05-02},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2405.02353},
  eprint      = {2405.02353},
  eprinttype  = {arXiv},
  eprintclass = {cs.CL},
  keywords    = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution Share Alike 4.0 International},
  file        = {:Cheekati2024 - Early Transformers_ a Study on Efficient Training of Transformer Models through Early Bird Lottery Tickets.pdf:PDF:http\://arxiv.org/pdf/2405.02353v1},
  priority    = {prio2},
}

@Article{Lavoie2024,
  author      = {Lavoie, Samuel and Kirichenko, Polina and Ibrahim, Mark and Assran, Mahmoud and Wilson, Andrew Gordon and Courville, Aaron and Ballas, Nicolas},
  date        = {2024-04-30},
  title       = {Modeling Caption Diversity in Contrastive Vision-Language Pretraining},
  doi         = {10.48550/ARXIV.2405.00740},
  eprint      = {2405.00740},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Lavoie2024 - Modeling Caption Diversity in Contrastive Vision Language Pretraining.pdf:PDF:http\://arxiv.org/pdf/2405.00740v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Schaeffer2023a,
  author         = {Schaeffer, Rylan and Khona, Mikail and Robertson, Zachary and Boopathy, Akhilan and Pistunova, Kateryna and Rocks, Jason W. and Fiete, Ila Rani and Koyejo, Oluwasanmi},
  date           = {2023-03-24},
  title          = {Double Descent Demystified: Identifying, Interpreting \& Ablating the Sources of a Deep Learning Puzzle},
  doi            = {10.48550/ARXIV.2303.14151},
  eprint         = {2303.14151},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file           = {:Schaeffer2023a - Double Descent Demystified_ Identifying, Interpreting & Ablating the Sources of a Deep Learning Puzzle.pdf:PDF},
  keywords       = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Curth2023,
  author      = {Curth, Alicia and Jeffares, Alan and van der Schaar, Mihaela},
  date        = {2023-10-29},
  title       = {A {U-turn} on Double Descent: Rethinking Parameter Counting in Statistical Learning},
  doi         = {10.48550/ARXIV.2310.18988},
  eprint      = {2310.18988},
  eprintclass = {stat.ML},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Curth2023 - A U Turn on Double Descent_ Rethinking Parameter Counting in Statistical Learning.pdf:PDF},
  keywords    = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2023},
}

@Article{Patro2023a,
  author         = {Patro, Badri N. and Agneeswaran, Vijay Srinivas},
  title          = {Scattering Vision Transformer: Spectral Mixing Matters},
  date           = {2023-11-02},
  year           = {2023},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2311.01310},
  eprint         = {2311.01310},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Image and Video Processing (eess.IV), Signal Processing (eess.SP), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  copyright      = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file           = {:Patro2023a - Scattering Vision Transformer_ Spectral Mixing Matters.pdf:PDF:http\://arxiv.org/pdf/2311.01310v2},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Guibas2022,
  author         = {Guibas, John and Mardani, Morteza and Li, Zongyi and Tao, Andrew and Anandkumar, Anima and Catanzaro, Bryan},
  booktitle      = {International Conference on Learning Representations},
  title          = {Efficient Token Mixing for Transformers via Adaptive {Fourier} Neural Operators},
  eprint         = {2111.13587},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  url            = {https://openreview.net/forum?id=EXHG-A3jlM},
  file           = {:Guibas2021 - Adaptive Fourier Neural Operators_ Efficient Token Mixers for Transformers.pdf:PDF:http\://arxiv.org/pdf/2111.13587v2},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@Article{Patro2023b,
  author         = {Patro, Badri N. and Namboodiri, Vinay P. and Agneeswaran, Vijay Srinivas},
  date           = {2023-04-13},
  title          = {{SpectFormer}: Frequency and Attention is what you need in a Vision Transformer},
  doi            = {10.48550/ARXIV.2304.06446},
  eprint         = {2304.06446},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file           = {:Patro2023b - SpectFormer_ Frequency and Attention Is What You Need in a Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2304.06446v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Beck2024,
  author         = {Beck, Maximilian and Pöppel, Korbinian and Spanring, Markus and Auer, Andreas and Prudnikova, Oleksandra and Kopp, Michael and Klambauer, Günter and Brandstetter, Johannes and Hochreiter, Sepp},
  date           = {2024-05-07},
  title          = {{xLSTM}: Extended Long Short-Term Memory},
  doi            = {10.48550/ARXIV.2405.04517},
  eprint         = {2405.04517},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Beck2024 - XLSTM_ Extended Long Short Term Memory.pdf:PDF:http\://arxiv.org/pdf/2405.04517v1},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Article{Babiloni2023,
  author         = {Babiloni, Francesca and Marras, Ioannis and Deng, Jiankang and Kokkinos, Filippos and Maggioni, Matteo and Chrysos, Grigorios and Torr, Philip and Zafeiriou, Stefanos},
  title          = {Linear Complexity Self-Attention with 3rd Order Polynomials},
  journal        = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  year           = {2023},
  pages          = {1--12},
  publisher      = {Institute of Electrical and Electronics Engineers ({IEEE})},
  doi            = {10.1109/tpami.2022.3231971},
  keywords       = {self-attention, non-local blocks, transformers, polynomial expansion, neural networks},
  file           = {:Babiloni2023 - Linear Complexity Self Attention with 3rd Order Polynomials.pdf:PDF;:Babiloni2023 - Linear Complexity Self Attention with _3$$_text$$rd$$$$_ Order Polynomials.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Yu2024,
  author      = {Yu, Weihao and Wang, Xinchao},
  date        = {2024-05-13},
  title       = {{MambaOut}: Do We Really Need {Mamba} for Vision?},
  doi         = {10.48550/ARXIV.2405.07992},
  eprint      = {2405.07992},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Yu2024 - MambaOut_ Do We Really Need Mamba for Vision_.pdf:PDF:http\://arxiv.org/pdf/2405.07992v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Wei2024,
  author      = {Wei, Zihao and Pan, Zixuan and Owens, Andrew},
  title       = {Efficient Vision-Language Pre-training by Cluster Masking},
  date        = {2024-05-14},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2405.08815},
  eprint      = {2405.08815},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  abstract    = {We propose a simple strategy for masking image patches during visual-language contrastive learning that improves the quality of the learned representations and the training speed. During each iteration of training, we randomly mask clusters of visually similar image patches, as measured by their raw pixel intensities. This provides an extra learning signal, beyond the contrastive training itself, since it forces a model to predict words for masked visual structures solely from context. It also speeds up training by reducing the amount of data used in each image. We evaluate the effectiveness of our model by pre-training on a number of benchmarks, finding that it outperforms other masking strategies, such as FLIP, on the quality of the learned representation.},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Wei2024 - Efficient Vision Language Pre Training by Cluster Masking.pdf:PDF:http\://arxiv.org/pdf/2405.08815v1},
  priority    = {prio3},
}

@Article{Vasu2024,
  author         = {Vasu, Pavan Kumar Anasosalu and Pouransari, Hadi and Faghri, Fartash and Tuzel, Oncel},
  date           = {2024-05-14},
  title          = {{CLIP} with Quality Captions: A Strong Pretraining for Vision Tasks},
  doi            = {10.48550/ARXIV.2405.08911},
  eprint         = {2405.08911},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Vasu2024 - CLIP with Quality Captions_ a Strong Pretraining for Vision Tasks.pdf:PDF:http\://arxiv.org/pdf/2405.08911v1},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Article{Vasu2023,
  author         = {Vasu, Pavan Kumar Anasosalu and Pouransari, Hadi and Faghri, Fartash and Vemulapalli, Raviteja and Tuzel, Oncel},
  date           = {2023-11-28},
  title          = {{MobileCLIP}: Fast Image-Text Models through Multi-Modal Reinforced Training},
  doi            = {10.48550/ARXIV.2311.17049},
  eprint         = {2311.17049},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  file           = {:Vasu2023 - MobileCLIP_ Fast Image Text Models through Multi Modal Reinforced Training.pdf:PDF:http\://arxiv.org/pdf/2311.17049v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Pan2024,
  author      = {Pan, Xu and Philip, Aaron and Xie, Ziqian and Schwartz, Odelia},
  title       = {Dissecting Query-Key Interaction in Vision Transformers},
  date        = {2024-04-04},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2405.14880},
  eprint      = {2405.14880},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file        = {:Pan2024 - Dissecting Query Key Interaction in Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2405.14880v1},
  priority    = {prio3},
}

@Article{Yadkori2024,
  author         = {Yadkori, Yasin Abbasi and Kuzborskij, Ilja and György, András and Szepesvári, Csaba},
  date           = {2024-06-04},
  title          = {To Believe or Not to Believe Your {LLM}},
  doi            = {10.48550/ARXIV.2406.02543},
  eprint         = {2406.02543},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Yadkori2024 - To Believe or Not to Believe Your LLM.pdf:PDF:http\://arxiv.org/pdf/2406.02543v1},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Article{Zhang2024a,
  author      = {Zhang, Shuoxi and Liu, Hanpeng and Lin, Stephen and He, Kun},
  title       = {You Only Need Less Attention at Each Stage in Vision Transformers},
  date        = {2024-06-01},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2406.00427},
  eprint      = {2406.00427},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Zhang2024a - You Only Need Less Attention at Each Stage in Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2406.00427v1},
  priority    = {prio3},
}

@Article{Rezaei2024,
  author         = {Rezaei, Razieh and Sabet, Masoud Jalili and Gu, Jindong and Rueckert, Daniel and Torr, Philip and Khakzar, Ashkan},
  title          = {Learning Visual Prompts for Guiding the Attention of Vision Transformers},
  date           = {2024-06-05},
  year           = {2024},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2406.03303},
  eprint         = {2406.03303},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Rezaei2024 - Learning Visual Prompts for Guiding the Attention of Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2406.03303v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Park2024,
  author    = {Park, Sungho and Byun, Hyeran},
  title     = {{Fair-VPT:} Fair Visual Prompt Tuning for Image Classification},
  booktitle = {Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
  file      = {:Park2024 - Fair VPT_ Fair Visual Prompt Tuning for Image Classification.pdf:PDF},
  priority  = {prio3},
}

@InProceedings{Hu2024CVPR,
  author    = {Hu, Xinting and Jiang, Li and Schiele, Bernt},
  title     = {Training Vision Transformers for Semi-Supervised Semantic Segmentation},
  booktitle = {Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
  file      = {:Hu2024CVPR - Training Vision Transformers for Semi Supervised Semantic Segmentation.pdf:PDF},
  priority  = {prio3},
}

@Article{Yang2024b,
  author      = {Yang, Songlin and Wang, Bailin and Zhang, Yu and Shen, Yikang and Kim, Yoon},
  title       = {Parallelizing Linear Transformers with the Delta Rule over Sequence Length},
  date        = {2024-06-10},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2406.06484},
  eprint      = {2406.06484},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  keywords    = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences},
  copyright   = {Creative Commons Zero v1.0 Universal},
  file        = {:Yang2024b - Parallelizing Linear Transformers with the Delta Rule Over Sequence Length.pdf:PDF:http\://arxiv.org/pdf/2406.06484v1},
  priority    = {prio3},
}

@Article{Mahmoud2023,
  author         = {Mahmoud, Anas and Elhoushi, Mostafa and Abbas, Amro and Yang, Yu and Ardalani, Newsha and Leather, Hugh and Morcos, Ari},
  title          = {Sieve: Multimodal Dataset Pruning Using Image Captioning Models},
  date           = {2023-10-03},
  year           = {2023},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2310.02110},
  eprint         = {2310.02110},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Mahmoud2023 - Sieve_ Multimodal Dataset Pruning Using Image Captioning Models.pdf:PDF:http\://arxiv.org/pdf/2310.02110v2},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Nguyen2024,
  author         = {Nguyen, Duy-Kien and Assran, Mahmoud and Jain, Unnat and Oswald, Martin R. and Snoek, Cees G. M. and Chen, Xinlei},
  title          = {An Image is Worth More Than 16x16 Patches: Exploring Transformers on Individual Pixels},
  date           = {2024-06-13},
  year           = {2024},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2406.09415},
  eprint         = {2406.09415},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Nguyen2024 - An Image Is Worth More Than 16x16 Patches_ Exploring Transformers on Individual Pixels.pdf:PDF:http\://arxiv.org/pdf/2406.09415v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Jiang2024,
  author         = {Jiang, Zhixing and Yin, Dennis and Khoda, Elham E and Loncar, Vladimir and Govorkova, Ekaterina and Moreno, Eric and Harris, Philip and Hauck, Scott and Hsu, Shih-Chieh},
  date           = {2024-02-01},
  booktitle      = {Machine Learning and the Physical Sciences Workshop, NeurIPS 2023},
  title          = {Ultra Fast Transformers on {FPGAs} for Particle Physics Experiments},
  doi            = {10.48550/ARXIV.2402.01047},
  eprint         = {2402.01047},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Jiang2024 - Ultra Fast Transformers on FPGAs for Particle Physics Experiments.pdf:PDF:http\://arxiv.org/pdf/2402.01047v1},
  keywords       = {Machine Learning (cs.LG), Hardware Architecture (cs.AR), High Energy Physics - Experiment (hep-ex), FOS: Computer and information sciences, FOS: Physical sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@InProceedings{Li2020,
  author         = {Li, Bingbing and Pandey, Santosh and Fang, Haowen and Lyv, Yanjun and Li, Ji and Chen, Jieyang and Xie, Mimi and Wan, Lipeng and Liu, Hang and Ding, Caiwen},
  booktitle      = {Proceedings of the ACM/IEEE International Symposium on Low Power Electronics and Design},
  title          = {{FTRANS}: Energy-Efficient Acceleration of Transformers Using {FPGA}},
  eprint         = {2007.08563},
  eprintclass    = {cs.DC},
  eprinttype     = {arXiv},
  pages          = {175--180},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Li2020 - FTRANS_ Energy Efficient Acceleration of Transformers Using FPGA.pdf:PDF:http\://arxiv.org/pdf/2007.08563v1},
  keywords       = {Distributed / Parallel / Cluster Computing (cs.DC), Machine Learning (cs.LG), FOS: Computer and information sciences, C.1.4},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2020},
}

@InProceedings{Peng2021b,
  author         = {Peng, Hongwu and Huang, Shaoyi and Geng, Tong and Li, Ang and Jiang, Weiwen and Liu, Hang and Wang, Shusen and Ding, Caiwen},
  booktitle      = {2021 22nd International Symposium on Quality Electronic Design (ISQED)},
  title          = {Accelerating Transformer-based Deep Learning Models on {FPGAs} using Column Balanced Block Pruning},
  doi            = {10.1109/ISQED51717.2021.9424344},
  pages          = {142--148},
  file           = {:Peng2021b - Accelerating Transformer Based Deep Learning Models on FPGAs Using Column Balanced Block Pruning.pdf:PDF},
  keywords       = {Training;Deep learning;Graphics processing units;Parallel processing;Hardware;Natural language processing;Sparse matrices;Transformer;deep learning;pruning;acceleration;FPGA},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Devaguptapu2024,
  author      = {Devaguptapu, Chaitanya and Aithal, Sumukh and Ramasubramanian, Shrinivas and Yamada, Moyuru and Kaul, Manohar},
  title       = {Semantic Graph Consistency: Going Beyond Patches for Regularizing Self-Supervised Vision Transformers},
  date        = {2024-06-18},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2406.12944},
  eprint      = {2406.12944},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Devaguptapu2024 - Semantic Graph Consistency_ Going beyond Patches for Regularizing Self Supervised Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2406.12944v1},
  priority    = {prio3},
}

@Article{Cheon2024,
  author      = {Cheon, Minjong},
  date        = {2024-06-21},
  title       = {Demonstrating the Efficacy of {Kolmogorov-Arnold} Networks in Vision Tasks},
  doi         = {10.48550/ARXIV.2406.14916},
  eprint      = {2406.14916},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Cheon2024 - Demonstrating the Efficacy of Kolmogorov Arnold Networks in Vision Tasks.pdf:PDF:http\://arxiv.org/pdf/2406.14916v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Huang2024,
  author      = {Huang, Brandon and Mitra, Chancharik and Arbelle, Assaf and Karlinsky, Leonid and Darrell, Trevor and Herzig, Roei},
  title       = {Multimodal Task Vectors Enable Many-Shot Multimodal In-Context Learning},
  date        = {2024-06-21},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2406.15334},
  eprint      = {2406.15334},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Huang2024 - Multimodal Task Vectors Enable Many Shot Multimodal in Context Learning.pdf:PDF:http\://arxiv.org/pdf/2406.15334v1},
  priority    = {prio2},
}

@Article{Lu2021,
  author         = {Lu, Jiachen and Yao, Jinghan and Zhang, Junge and Zhu, Xiatian and Xu, Hang and Gao, Weiguo and Xu, Chunjing and Xiang, Tao and Zhang, Li},
  date           = {2021-10-22},
  title          = {{SOFT}: Softmax-free Transformer with Linear Complexity},
  doi            = {10.48550/ARXIV.2110.11945},
  eprint         = {2110.11945},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Lu2021 - SOFT_ Softmax Free Transformer with Linear Complexity.pdf:PDF:http\://arxiv.org/pdf/2110.11945v3},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Lou2024,
  author      = {Lou, Chao and Jia, Zixia and Zheng, Zilong and Tu, Kewei},
  title       = {Sparser is Faster and Less is More: Efficient Sparse Attention for Long-Range Transformers},
  date        = {2024-06-24},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2406.16747},
  eprint      = {2406.16747},
  eprinttype  = {arXiv},
  eprintclass = {cs.CL},
  keywords    = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Lou2024 - Sparser Is Faster and Less Is More_ Efficient Sparse Attention for Long Range Transformers.pdf:PDF:http\://arxiv.org/pdf/2406.16747v1},
  priority    = {prio3},
}

@InProceedings{You2023,
  author         = {You, Haoran and Shi, Huihong and Guo, Yipin and Lin, Yingyan},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {{ShiftAddViT}: Mixture of Multiplication Primitives Towards Efficient Vision Transformer},
  editor         = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
  pages          = {33319--33337},
  publisher      = {Curran Associates, Inc.},
  url            = {https://proceedings.neurips.cc/paper_files/paper/2023/file/69c49f75ca31620f1f0d38093d9f3d9b-Paper-Conference.pdf},
  volume         = {36},
  file           = {:You2023 - ShiftAddViT_ Mixture of Multiplication Primitives Towards Efficient Vision Transformer.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Han2023a,
  author         = {Han, Dongchen and Pan, Xuran and Han, Yizeng and Song, Shiji and Huang, Gao},
  date           = {2023-08-01},
  title          = {{FLatten} Transformer: Vision Transformer using Focused Linear Attention},
  doi            = {10.48550/ARXIV.2308.00442},
  eprint         = {2308.00442},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Han2023a - FLatten Transformer_ Vision Transformer Using Focused Linear Attention.pdf:PDF:http\://arxiv.org/pdf/2308.00442v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Nandam2024,
  author      = {Nandam, Srinivasa Rao and Atito, Sara and Feng, Zhenhua and Kittler, Josef and Awais, Muhammad},
  title       = {Investigating Self-Supervised Methods for Label-Efficient Learning},
  date        = {2024-06-25},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2406.17460},
  eprint      = {2406.17460},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Nandam2024 - Investigating Self Supervised Methods for Label Efficient Learning.pdf:PDF:http\://arxiv.org/pdf/2406.17460v1},
  priority    = {prio3},
}

@InProceedings{Guo2024,
  author         = {Guo, Jialong and Chen, Xinghao and Tang, Yehui and Wang, Yunhe},
  booktitle      = {International Conference on Machine Learning},
  title          = {{SLAB}: Efficient Transformers with Simplified Linear Attention and Progressive Re-parameterized Batch Normalization},
  eprint         = {2405.11582},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  file           = {:Guo2024 - SLAB_ Efficient Transformers with Simplified Linear Attention and Progressive Re Parameterized Batch Normalization.pdf:PDF:http\://arxiv.org/pdf/2405.11582v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@InProceedings{Ma2024,
  author      = {Ma, Xu and Dai, Xiyang and Yang, Jianwei and Xiao, Bin and Chen, Yinpeng and Fu, Yun and Yuan, Lu},
  title       = {Efficient Modulation for Vision Networks},
  booktitle   = {The Twelfth International Conference on Learning Representations},
  year        = {2024},
  eprint      = {2403.19963},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  url         = {https://openreview.net/forum?id=ip5LHJs6QX},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file        = {:Ma2024 - Efficient Modulation for Vision Networks.pdf:PDF:http\://arxiv.org/pdf/2403.19963v1},
  priority    = {prio3},
}

@Article{Hu2024,
  author         = {Hu, Youbing and Cheng, Yun and Lu, Anqi and Cao, Zhiqiang and Wei, Dawei and Liu, Jie and Li, Zhijun},
  date           = {2024-01-08},
  title          = {{LF-ViT}: Reducing Spatial Redundancy in Vision Transformer for Efficient Image Recognition},
  doi            = {10.48550/ARXIV.2402.00033},
  eprint         = {2402.00033},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:http\://arxiv.org/pdf/2402.00033v1:PDF},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Article{Wang2024,
  author         = {Wang, Qian-Wei and Xie, Yuqiu and Zhang, Letian and Liu, Zimo and Xia, Shu-Tao},
  title          = {Pre-Trained Vision-Language Models as Partial Annotators},
  date           = {2024-05-23},
  year           = {2024},
  publisher      = {arXiv},
  doi            = {10.48550/ARXIV.2406.18550},
  eprint         = {2406.18550},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  copyright      = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file           = {:Wang2024 - Pre Trained Vision Language Models As Partial Annotators.pdf:PDF:http\://arxiv.org/pdf/2406.18550v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Xu2023b,
  author         = {Xu, Xuwei and Wang, Sen and Chen, Yudong and Zheng, Yanping and Wei, Zhewei and Liu, Jiajun},
  date           = {2023-11-06},
  title          = {{GTP-ViT}: Efficient Vision Transformers via Graph-based Token Propagation},
  doi            = {10.48550/ARXIV.2311.03035},
  eprint         = {2311.03035},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  note           = {WACV2024 oral},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Xu2023b - GTP ViT_ Efficient Vision Transformers Via Graph Based Token Propagation.pdf:PDF:http\://arxiv.org/pdf/2311.03035v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@InProceedings{Vasu2023a,
  author         = {Vasu, Pavan Kumar Anasosalu and Gabriel, James and Zhu, Jeff and Tuzel, Oncel and Ranjan, Anurag},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {{FastViT}: A Fast Hybrid Vision Transformer Using Structural Reparameterization},
  eprint         = {2303.14189},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  pages          = {5785--5795},
  file           = {:Vasu2023a - FastViT_ a Fast Hybrid Vision Transformer Using Structural Reparameterization.pdf:PDF:http\://arxiv.org/pdf/2303.14189v2},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@InProceedings{Shaker2023,
  author         = {Shaker, Abdelrahman and Maaz, Muhammad and Rasheed, Hanoona and Khan, Salman and Yang, Ming-Hsuan and Khan, Fahad Shahbaz},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {{SwiftFormer}: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications},
  eprint         = {2303.15446},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  file           = {:Shaker2023 - SwiftFormer_ Efficient Additive Attention for Transformer Based Real Time Mobile Vision Applications.pdf:PDF:http\://arxiv.org/pdf/2303.15446v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Wang2023,
  author      = {Wang, Ao and Chen, Hui and Lin, Zijia and Han, Jungong and Ding, Guiguang},
  date        = {2023-07-18},
  title       = {{RepViT}: Revisiting Mobile {CNN} From {ViT} Perspective},
  doi         = {10.48550/ARXIV.2307.09283},
  eprint      = {2307.09283},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  note        = {CVPR 2024},
  file        = {:Wang2023 - RepViT_ Revisiting Mobile CNN from ViT Perspective.pdf:PDF:http\://arxiv.org/pdf/2307.09283v8},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Zhang2021,
  author         = {Qinglong Zhang and Yu-Bin Yang},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {{ResT}: An Efficient Transformer for Visual Recognition},
  editor         = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P. S. Liang and J. Wortman Vaughan},
  eprint         = {2105.13677},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  publisher      = {Curran Associates, Inc.},
  url            = {https://openreview.net/forum?id=6Ab68Ip4Mu},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Zhang2021 - ResT_ an Efficient Transformer for Visual Recognition.pdf:PDF:http\://arxiv.org/pdf/2105.13677v5},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Beyer2020,
  author         = {Beyer, Lucas and Hénaff, Olivier J. and Kolesnikov, Alexander and Zhai, Xiaohua and van den Oord, Aäron},
  date           = {2020-06-12},
  title          = {Are we done with {ImageNet}?},
  doi            = {10.48550/ARXIV.2006.07159},
  eprint         = {2006.07159},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Beyer2020 - Are We Done with ImageNet_.pdf:PDF:http\://arxiv.org/pdf/2006.07159v1},
  groups         = {Coreset for FL},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2020},
}

@InProceedings{Bolya2022,
  author         = {Bolya, Daniel and Fu, Cheng-Yang and Dai, Xiaoliang and Zhang, Peizhao and Hoffman, Judy},
  title          = {Hydra Attention: Efficient Attention with Many Heads},
  booktitle      = {Computer Vision -- ECCV 2022 Workshops},
  editor         = {Karlinsky, Leonid and Michaeli, Tomer and Nishino, Ko},
  year           = {2022},
  pages          = {35--49},
  publisher      = {Springer Nature Switzerland},
  address        = {Cham},
  isbn           = {978-3-031-25082-8},
  file           = {:Bolya2022 - Hydra Attention_ Efficient Attention with Many Heads.pdf:PDF:http\://arxiv.org/pdf/2209.07484v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@InProceedings{Zhang2023c,
  author         = {Xiaosong Zhang and Yunjie Tian and Lingxi Xie and Wei Huang and Qi Dai and Qixiang Ye and Qi Tian},
  booktitle      = {The Eleventh International Conference on Learning Representations},
  title          = {{HiViT}: A Simpler and More Efficient Design of Hierarchical Vision Transformer},
  eprint         = {2205.14949},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  url            = {https://openreview.net/forum?id=3F6I-0-57SC},
  file           = {:Zhang2022b - HiViT_ Hierarchical Vision Transformer Meets Masked Image Modeling.pdf:PDF:http\://arxiv.org/pdf/2205.14949v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Li2022b,
  author         = {Li, Jiashi and Xia, Xin and Li, Wei and Li, Huixia and Wang, Xing and Xiao, Xuefeng and Wang, Rui and Zheng, Min and Pan, Xin},
  date           = {2022-07-12},
  title          = {{Next-ViT}: Next Generation Vision Transformer for Efficient Deployment in Realistic Industrial Scenarios},
  doi            = {10.48550/ARXIV.2207.05501},
  eprint         = {2207.05501},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Li2022b - Next ViT_ Next Generation Vision Transformer for Efficient Deployment in Realistic Industrial Scenarios.pdf:PDF:http\://arxiv.org/pdf/2207.05501v4},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@InProceedings{Cai2023,
  author         = {Cai, Han and Li, Junyan and Hu, Muyan and Gan, Chuang and Han, Song},
  booktitle      = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title          = {{EfficientViT}: Lightweight Multi-Scale Attention for High-Resolution Dense Prediction},
  eprint         = {2205.14756},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  pages          = {17302--17313},
  file           = {:Cai2022 - EfficientViT_ Multi Scale Linear Attention for High Resolution Dense Prediction.pdf:PDF:http\://arxiv.org/pdf/2205.14756v6},
  month          = oct,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@InProceedings{Zhou2021,
  author         = {Haoyi Zhou and Shanghang Zhang and Jieqi Peng and Shuai Zhang and Jianxin Li and Hui Xiong and Wancai Zhang},
  title          = {Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting},
  booktitle      = {Thirty-Fifth {AAAI} Conference on Artificial Intelligence, {AAAI} 2021, Thirty-Third Conference on Innovative Applications of Artificial Intelligence, {IAAI} 2021, The Eleventh Symposium on Educational Advances in Artificial Intelligence, {EAAI} 2021, Virtual Event, February 2-9, 2021},
  year           = {2021},
  pages          = {11106--11115},
  publisher      = {{AAAI} Press},
  eprint         = {2012.07436},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  doi            = {10.1609/AAAI.V35I12.17325},
  url            = {https://doi.org/10.1609/aaai.v35i12.17325},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Information Retrieval (cs.IR), FOS: Computer and information sciences},
  file           = {:Zhou2020 - Informer_ beyond Efficient Transformer for Long Sequence Time Series Forecasting.pdf:PDF:http\://arxiv.org/pdf/2012.07436v3},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Shao2024,
  author         = {Shao, Tong and Tian, Zhuotao and Zhao, Hang and Su, Jingyong},
  date           = {2024-07-11},
  title          = {Explore the Potential of {CLIP} for Training-Free Open Vocabulary Semantic Segmentation},
  doi            = {10.48550/ARXIV.2407.08268},
  eprint         = {2407.08268},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Shao2024 - Explore the Potential of CLIP for Training Free Open Vocabulary Semantic Segmentation.pdf:PDF:http\://arxiv.org/pdf/2407.08268v1},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2024},
}

@Article{Simoncini2024,
  author         = {Simoncini, Walter and Gidaris, Spyros and Bursuc, Andrei and Asano, Yuki M.},
  title          = {No Train, all Gain: Self-Supervised Gradients Improve Deep Frozen Representations},
  date           = {2024-07-15},
  year           = {2024},
  eprint         = {2407.10964},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2407.10964},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file           = {:Simoncini2024 - No Train, All Gain_ Self Supervised Gradients Improve Deep Frozen Representations.pdf:PDF:http\://arxiv.org/pdf/2407.10964v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Wang2024a,
  author      = {Wang, Yancheng and Yang, Yingzhen},
  title       = {Efficient Visual Transformer by Learnable Token Merging},
  date        = {2024-07-21},
  year        = {2024},
  eprint      = {2407.15219},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2407.15219},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Wang2024a - Efficient Visual Transformer by Learnable Token Merging.pdf:PDF:http\://arxiv.org/pdf/2407.15219v1},
  priority    = {prio3},
}

@Article{Chen2018,
  author         = {Chen, Ricky T. Q. and Rubanova, Yulia and Bettencourt, Jesse and Duvenaud, David},
  title          = {Neural Ordinary Differential Equations},
  date           = {2018-06-19},
  year           = {2018},
  eprint         = {1806.07366},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  doi            = {10.48550/ARXIV.1806.07366},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences},
  file           = {:Chen2018 - Neural Ordinary Differential Equations.pdf:PDF:http\://arxiv.org/pdf/1806.07366v5},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Wang2024b,
  author      = {Wang, Haoqi and Zhang, Tong and Salzmann, Mathieu},
  date        = {2024-07-23},
  title       = {{SINDER}: Repairing the Singular Defects of {DINOv2}},
  doi         = {10.48550/ARXIV.2407.16826},
  eprint      = {2407.16826},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Wang2024b - SINDER_ Repairing the Singular Defects of DINOv2.pdf:PDF:http\://arxiv.org/pdf/2407.16826v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Zhang2024b,
  author      = {Zhang, Tianxiao and Xu, Wenju and Luo, Bo and Wang, Guanghui},
  title       = {Depth-Wise Convolutions in Vision Transformers for Efficient Training on Small Datasets},
  date        = {2024-07-28},
  year        = {2024},
  eprint      = {2407.19394},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2407.19394},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Zhang2024b - Depth Wise Convolutions in Vision Transformers for Efficient Training on Small Datasets.pdf:PDF:http\://arxiv.org/pdf/2407.19394v1},
  priority    = {prio3},
}

@Article{Qi2023,
  author      = {Qi, Xianbiao and Wang, Jianan and Chen, Yihao and Shi, Yukai and Zhang, Lei},
  date        = {2023-04-19},
  title       = {{LipsFormer}: Introducing {Lipschitz} Continuity to Vision Transformers},
  doi         = {10.48550/ARXIV.2304.09856},
  eprint      = {2304.09856},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Qi2023 - LipsFormer_ Introducing Lipschitz Continuity to Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2304.09856v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2023},
}

@Article{Shalam2024,
  author      = {Shalam, Daniel and Korman, Simon},
  title       = {Unsupervised Representation Learning by Balanced Self Attention Matching},
  date        = {2024-08-04},
  year        = {2024},
  eprint      = {2408.02014},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2408.02014},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Shalam2024 - Unsupervised Representation Learning by Balanced Self Attention Matching.pdf:PDF:http\://arxiv.org/pdf/2408.02014v1},
  readstatus  = {read},
}

@Article{Foret2020,
  author         = {Foret, Pierre and Kleiner, Ariel and Mobahi, Hossein and Neyshabur, Behnam},
  title          = {Sharpness-Aware Minimization for Efficiently Improving Generalization},
  date           = {2020-10-03},
  year           = {2020},
  eprint         = {2010.01412},
  eprinttype     = {arXiv},
  eprintclass    = {cs.LG},
  doi            = {10.48550/ARXIV.2010.01412},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  file           = {:Foret2020 - Sharpness Aware Minimization for Efficiently Improving Generalization.pdf:PDF:http\://arxiv.org/pdf/2010.01412v3},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Gou2024,
  author         = {Gou, Chenhui and Felemban, Abdulwahab and Khan, Faizan Farooq and Zhu, Deyao and Cai, Jianfei and Rezatofighi, Hamid and Elhoseiny, Mohamed},
  title          = {How Well Can Vision Language Models See Image Details?},
  date           = {2024-08-07},
  year           = {2024},
  eprint         = {2408.03940},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2408.03940},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file           = {:Gou2024 - How Well Can Vision Language Models See Image Details_.pdf:PDF:http\://arxiv.org/pdf/2408.03940v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Zhang2024c,
  author      = {Zhang, Tianfang and Li, Lei and Zhou, Yang and Liu, Wentao and Qian, Chen and Ji, Xiangyang},
  date        = {2024-08-07},
  title       = {{CAS-ViT}: Convolutional Additive Self-attention Vision Transformers for Efficient Mobile Applications},
  doi         = {10.48550/ARXIV.2408.03703},
  eprint      = {2408.03703},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Zhang2024c - CAS ViT_ Convolutional Additive Self Attention Vision Transformers for Efficient Mobile Applications.pdf:PDF:http\://arxiv.org/pdf/2408.03703v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Alper2024,
  author         = {Alper, Morris and Averbuch-Elor, Hadar},
  title          = {Emergent Visual-Semantic Hierarchies in Image-Text Representations},
  date           = {2024-07-11},
  year           = {2024},
  eprint         = {2407.08521},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2407.08521},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), FOS: Computer and information sciences},
  file           = {:Alper2024 - Emergent Visual Semantic Hierarchies in Image Text Representations.pdf:PDF:http\://arxiv.org/pdf/2407.08521v2},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Jie2024,
  author      = {Jie, Shibo and Tang, Yehui and Guo, Jianyuan and Deng, Zhi-Hong and Han, Kai and Wang, Yunhe},
  title       = {Token Compensator: Altering Inference Cost of Vision Transformer without Re-Tuning},
  date        = {2024-08-13},
  year        = {2024},
  eprint      = {2408.06798},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2408.06798},
  publisher   = {arXiv},
  copyright   = {Creative Commons Zero v1.0 Universal},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Jie2024 - Token Compensator_ Altering Inference Cost of Vision Transformer without Re Tuning.pdf:PDF:http\://arxiv.org/pdf/2408.06798v1},
  priority    = {prio3},
}

@Misc{Ranzinger2023,
  author      = {Ranzinger, Mike and Heinrich, Greg and Kautz, Jan and Molchanov, Pavlo},
  date        = {2023-12-10},
  title       = {{AM-RADIO}: Agglomerative Vision Foundation Model -- Reduce All Domains Into One},
  doi         = {10.48550/ARXIV.2312.06709},
  eprint      = {2312.06709},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  note        = {CVPR 2024},
  abstract    = {efficient backbone, we evaluated numerous architectures in},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file        = {:Ranzinger2023 - AM RADIO_ Agglomerative Vision Foundation Model Reduce All Domains into One.pdf:PDF},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2023},
}

@InProceedings{Zoph2020,
  author         = {Zoph, Barret and Ghiasi, Golnaz and Lin, Tsung-Yi and Cui, Yin and Liu, Hanxiao and Cubuk, Ekin Dogus and Le, Quoc},
  booktitle      = {Advances in Neural Information Processing Systems},
  title          = {Rethinking Pre-training and Self-training},
  editor         = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
  pages          = {3833--3845},
  publisher      = {Curran Associates, Inc.},
  url            = {https://proceedings.neurips.cc/paper_files/paper/2020/file/27e9661e033a73a6ad8cefcde965c54d-Paper.pdf},
  volume         = {33},
  file           = {:Zoph2020 - Rethinking Pre Training and Self Training.pdf:PDF},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2020},
}

@InProceedings{Pham2021,
  author         = {Pham, Hieu and Dai, Zihang and Xie, Qizhe and Le, Quoc V.},
  booktitle      = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title          = {Meta Pseudo Labels},
  pages          = {11557--11568},
  file           = {:Pham2021 - Meta Pseudo Labels.pdf:PDF},
  month          = jun,
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@InProceedings{Raghu2021,
  author    = {Raghu, Aniruddh and Lorraine, Jonathan and Kornblith, Simon and McDermott, Matthew and Duvenaud, David K},
  booktitle = {Advances in Neural Information Processing Systems},
  title     = {Meta-learning to Improve Pre-training},
  editor    = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P. S. Liang and J. Wortman Vaughan},
  pages     = {23231--23244},
  publisher = {Curran Associates, Inc.},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2021/file/c3810d4a9513b028fc0f2a83cb6d7b50-Paper.pdf},
  volume    = {34},
  file      = {:Raghu2021 - Meta Learning to Improve Pre Training.pdf:PDF},
  priority  = {prio2},
  year      = {2021},
}

@Article{Kage2024,
  author         = {Kage, Patrick and Rothenberger, Jay C. and Andreadis, Pavlos and Diochnos, Dimitrios I.},
  title          = {A Review of Pseudo-Labeling for Computer Vision},
  date           = {2024-08-13},
  year           = {2024},
  eprint         = {2408.07221},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2408.07221},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences, I.2.0; I.5.4; I.4.0},
  file           = {:Kage2024 - A Review of Pseudo Labeling for Computer Vision.pdf:PDF:http\://arxiv.org/pdf/2408.07221v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Xu2024,
  author         = {Moucheng Xu and Yukun Zhou and Chen Jin and Marius {de Groot} and Daniel C. Alexander and Neil P. Oxtoby and Yipeng Hu and Joseph Jacob},
  title          = {Expectation maximisation pseudo labels},
  doi            = {10.1016/j.media.2024.103125},
  issn           = {1361-8415},
  pages          = {103125},
  url            = {https://www.sciencedirect.com/science/article/pii/S1361841524000501},
  volume         = {94},
  file           = {:Xu2024 - Expectation Maximisation Pseudo Labels.pdf:PDF},
  journal        = {Medical Image Analysis},
  keywords       = {Pseudo labels, Bayesian deep learning, Expectation–maximisation, Semi-supervised learning, Segmentation, Generative models, Robustness},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Article{Jin2022,
  author         = {Jin, Zezhong and Zhong, Dading and Song, Xiao and Liu, Zhaoyi and Ye, Naipeng and Zeng, Qingcheng},
  title          = {Filter and evolve: progressive pseudo label refining for semi-supervised automatic speech recognition},
  date           = {2022-10-28},
  year           = {2022},
  eprint         = {2210.16318},
  eprinttype     = {arXiv},
  eprintclass    = {cs.SD},
  doi            = {10.48550/ARXIV.2210.16318},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Sound (cs.SD), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  file           = {:Jin2022 - Filter and Evolve_ Progressive Pseudo Label Refining for Semi Supervised Automatic Speech Recognition.pdf:PDF:http\://arxiv.org/pdf/2210.16318v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Rothenberger2023,
  author         = {Rothenberger, Jay C. and Diochnos, Dimitrios I.},
  title          = {Meta Co-Training: Two Views are Better than One},
  date           = {2023-11-29},
  year           = {2023},
  eprint         = {2311.18083},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2311.18083},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences, I.2.6; I.4.10},
  file           = {:Rothenberger2023 - Meta Co Training_ Two Views Are Better Than One.pdf:PDF:http\://arxiv.org/pdf/2311.18083v4},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Plested2022,
  author      = {Plested, Jo and Gedeon, Tom},
  title       = {Deep transfer learning for image classification: a survey},
  date        = {2022-05-20},
  year        = {2022},
  eprint      = {2205.09904},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2205.09904},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file        = {:Plested2022 - Deep Transfer Learning for Image Classification_ a Survey.pdf:PDF:http\://arxiv.org/pdf/2205.09904v1},
  priority    = {prio2},
}

@Article{Nguyen2024a,
  author      = {Nguyen, Khanh-Binh and Park, Chae Jung},
  title       = {Symmetric masking strategy enhances the performance of Masked Image Modeling},
  date        = {2024-08-23},
  year        = {2024},
  eprint      = {2408.12772},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2408.12772},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file        = {:Nguyen2024a - Symmetric Masking Strategy Enhances the Performance of Masked Image Modeling.pdf:PDF:http\://arxiv.org/pdf/2408.12772v1},
  priority    = {prio3},
}

@Article{Khan2024,
  author      = {Khan, Asifullah and Sohail, Anabia and Fiaz, Mustansar and Hassan, Mehdi and Afridi, Tariq Habib and Marwat, Sibghat Ullah and Munir, Farzeen and Ali, Safdar and Naseem, Hannan and Zaheer, Muhammad Zaigham and Ali, Kamran and Sultana, Tangina and Tanoli, Ziaurrehman and Akhter, Naeem},
  title       = {A Survey of the Self Supervised Learning Mechanisms for Vision Transformers},
  date        = {2024-08-30},
  year        = {2024},
  eprint      = {2408.17059},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2408.17059},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Khan2024 - A Survey of the Self Supervised Learning Mechanisms for Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2408.17059v1},
  priority    = {prio3},
}

@Article{Wang2024c,
  author         = {Wang, Zhicai and Wei, Longhui and Wang, Tan and Chen, Heyu and Hao, Yanbin and Wang, Xiang and He, Xiangnan and Tian, Qi},
  title          = {Enhance Image Classification via Inter-Class Image Mixup with Diffusion Model},
  date           = {2024-03-28},
  year           = {2024},
  eprint         = {2403.19600},
  eprinttype     = {arXiv},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2403.19600},
  publisher      = {arXiv},
  copyright      = {Creative Commons Zero v1.0 Universal},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file           = {:Wang2024c - Enhance Image Classification Via Inter Class Image Mixup with Diffusion Model.pdf:PDF:http\://arxiv.org/pdf/2403.19600v1},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Islam2024,
  author         = {Islam, Khawar and Zaheer, Muhammad Zaigham and Mahmood, Arif and Nandakumar, Karthik},
  date           = {2024-04-05},
  title          = {{DiffuseMix}: Label-Preserving Data Augmentation with Diffusion Models},
  doi            = {10.48550/ARXIV.2405.14881},
  eprint         = {2405.14881},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Islam2024 - DiffuseMix_ Label Preserving Data Augmentation with Diffusion Models.pdf:PDF:http\://arxiv.org/pdf/2405.14881v1},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Article{Belghazi2018,
  author      = {Belghazi, Mohamed Ishmael and Rajeswar, Sai and Mastropietro, Olivier and Rostamzadeh, Negar and Mitrovic, Jovana and Courville, Aaron},
  title       = {Hierarchical Adversarially Learned Inference},
  date        = {2018-02-04},
  year        = {2018},
  eprint      = {1802.01071},
  eprinttype  = {arXiv},
  eprintclass = {stat.ML},
  doi         = {10.48550/ARXIV.1802.01071},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Belghazi2018 - Hierarchical Adversarially Learned Inference.pdf:PDF:http\://arxiv.org/pdf/1802.01071v1},
  priority    = {prio2},
}

@Article{Verma2018,
  author      = {Verma, Vikas and Lamb, Alex and Beckham, Christopher and Najafi, Amir and Mitliagkas, Ioannis and Courville, Aaron and Lopez-Paz, David and Bengio, Yoshua},
  title       = {Manifold Mixup: Better Representations by Interpolating Hidden States},
  date        = {2018-06-13},
  year        = {2018},
  eprint      = {1806.05236},
  eprinttype  = {arXiv},
  eprintclass = {stat.ML},
  doi         = {10.48550/ARXIV.1806.05236},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (stat.ML), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
  file        = {:Verma2018 - Manifold Mixup_ Better Representations by Interpolating Hidden States.pdf:PDF:http\://arxiv.org/pdf/1806.05236v7},
  priority    = {prio2},
}

@Article{Shen2020,
  author      = {Shen, Zhiqiang and Liu, Zechun and Liu, Zhuang and Savvides, Marios and Darrell, Trevor and Xing, Eric},
  date        = {2020-03-11},
  title       = {{Un-Mix}: Rethinking Image Mixtures for Unsupervised Visual Representation Learning},
  doi         = {10.48550/ARXIV.2003.05438},
  eprint      = {2003.05438},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Shen2020 - Un Mix_ Rethinking Image Mixtures for Unsupervised Visual Representation Learning.pdf:PDF:http\://arxiv.org/pdf/2003.05438v5},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2020},
}

@InProceedings{Beckham2019,
  author    = {Beckham, Christopher and Honari, Sina and Verma, Vikas and Lamb, Alex M. and Ghadiri, Farnoosh and Hjelm, R. Devon and Bengio, Yoshua and Pal, Chris},
  booktitle = {Advances in Neural Information Processing Systems},
  title     = {On Adversarial Mixup Resynthesis},
  editor    = {Wallach, H. and Larochelle, H. and Beygelzimer, A. and {d'Alch{\'e}-Buc}, F. and Fox, E. and Garnett, R.},
  publisher = {Curran Associates, Inc.},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2019/file/f708f064faaf32a43e4d3c784e6af9ea-Paper.pdf},
  volume    = {32},
  file      = {:Beckham2019 - On Adversarial Mixup Resynthesis.pdf:PDF},
  priority  = {prio2},
  year      = {2019},
}

@Article{Wang2024d,
  author      = {Wang, Yiheng and Lin, Jiayu and Lin, Zuoquan},
  title       = {A Comparative Study of Pre-training and Self-training},
  year        = {2024},
  date        = {2024-09-04},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2409.02751},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2409.02751},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Wang2024d - A Comparative Study of Pre Training and Self Training.pdf:PDF:http\://arxiv.org/pdf/2409.02751v1},
}

@Article{Knyazev2024,
  author      = {Knyazev, Boris and Moudgil, Abhinav and Lajoie, Guillaume and Belilovsky, Eugene and Lacoste-Julien, Simon},
  title       = {Accelerating Training with Neuron Interaction and Nowcasting Networks},
  year        = {2024},
  date        = {2024-09-06},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2409.04434},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2409.04434},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Knyazev2024 - Accelerating Training with Neuron Interaction and Nowcasting Networks.pdf:PDF:http\://arxiv.org/pdf/2409.04434v1},
}

@Article{Ramapuram2024,
  author      = {Ramapuram, Jason and Danieli, Federico and Dhekane, Eeshan and Weers, Floris and Busbridge, Dan and Ablin, Pierre and Likhomanenko, Tatiana and Digani, Jagrit and Gu, Zijin and Shidani, Amitis and Webb, Russ},
  title       = {Theory, Analysis, and Best Practices for Sigmoid Self-Attention},
  year        = {2024},
  date        = {2024-09-06},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2409.04431},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2409.04431},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Ramapuram2024 - Theory, Analysis, and Best Practices for Sigmoid Self Attention.pdf:PDF:http\://arxiv.org/pdf/2409.04431v1},
}

@Article{Yu2024a,
  author      = {Yu, Yonghao and Zhao, Dongcheng and Shen, Guobin and Dong, Yiting and Zeng, Yi},
  title       = {Brain-Inspired Stepwise Patch Merging for Vision Transformers},
  year        = {2024},
  date        = {2024-09-11},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2409.06963},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2409.06963},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Yu2024a - Brain Inspired Stepwise Patch Merging for Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2409.06963v1},
}

@InProceedings{Vo2024,
  author    = {Vo, Xuan-Thuy and Nguyen, Duy-Linh and Priadana, Adri and Jo, Kang-Hyun},
  title     = {Efficient Vision Transformers with Partial Attention},
  booktitle = {European Conference on Computer Vision (ECCV)},
  year      = {2024},
  priority  = {prio3},
  file      = {:Vo2024 - Efficient Vision Transformers with Partial Attention.pdf:PDF},
}

@InProceedings{Su2024,
  author    = {Su, Diwei and Fei, Cheng and Luo, Jianxu},
  booktitle = {European Conference on Computer Vision (ECCV)},
  title     = {Removing Rows and Columns of Tokens in Vision Transformer enables Faster Dense Prediction without Retraining},
  file      = {:Su2024 - Removing Rows and Columns of Tokens in Vision Transformer Enables Faster Dense Prediction without Retraining.pdf:PDF},
  priority  = {prio3},
  year      = {2024},
}

@InProceedings{Zheng2024,
  author    = {Zheng, Kecheng and Zhang, Yifei and Wu, Wei and Lu, Fan and Ma, Shuailei and Jin, Xin and Chen, Wei and Shen, Yujun},
  title     = {Language-Image Pre-training with Long Captions},
  booktitle = {European Conference on Computer Vision (ECCV)},
  year      = {2024},
  priority  = {prio2},
  file      = {:Zheng2024 - Language Image Pre Training with Long Captions.pdf:PDF},
}

@InProceedings{Li2024b,
  author    = {Li, Lujun and Wei, Zimian and Dong, Peijie and Luo, Wenhan and Xue, Wei and Liu, Qifeng and Guo, Yike},
  booktitle = {European Conference on Computer Vision (ECCV)},
  title     = {{AttnZero}: Efficient Attention Discovery for Vision Transformers},
  file      = {:Li2024b - AttnZero_ Efficient Attention Discovery for Vision Transformers.pdf:PDF},
  priority  = {prio3},
  year      = {2024},
}

@Article{Oquab2023,
  author         = {Oquab, Maxime and Darcet, Timothée and Moutakanni, Théo and Vo, Huy and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and El-Nouby, Alaaeldin and Assran, Mahmoud and Ballas, Nicolas and Galuba, Wojciech and Howes, Russell and Huang, Po-Yao and Li, Shang-Wen and Misra, Ishan and Rabbat, Michael and Sharma, Vasu and Synnaeve, Gabriel and Xu, Hu and Jegou, Hervé and Mairal, Julien and Labatut, Patrick and Joulin, Armand and Bojanowski, Piotr},
  date           = {2023-04-14},
  title          = {{DINOv2}: Learning Robust Visual Features without Supervision},
  doi            = {10.48550/ARXIV.2304.07193},
  eprint         = {2304.07193},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Oquab2023 - DINOv2_ Learning Robust Visual Features without Supervision.pdf:PDF:http\://arxiv.org/pdf/2304.07193v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Zadeh2020,
  author       = {Zadeh, Amir and Liang, Paul Pu and Morency, Louis-Philippe},
  date         = {2020-12},
  journaltitle = {Information Fusion},
  title        = {Foundations of Multimodal Co-learning},
  doi          = {10.1016/j.inffus.2020.06.001},
  issn         = {1566-2535},
  pages        = {188--193},
  volume       = {64},
  file         = {:Zadeh2020 - Foundations of Multimodal Co Learning.pdf:PDF},
  priority     = {prio1},
  publisher    = {Elsevier BV},
  year         = {2020},
}

@Article{Li2024c,
  author      = {Li, Zeyu Michael},
  title       = {Using Interleaved Ensemble Unlearning to Keep Backdoors at Bay for Finetuning Vision Transformers},
  year        = {2024},
  date        = {2024-10-01},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.01128},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2410.01128},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Li2024c - Using Interleaved Ensemble Unlearning to Keep Backdoors at Bay for Finetuning Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2410.01128v1},
}

@Article{Leviathan2024,
  author      = {Leviathan, Yaniv and Kalman, Matan and Matias, Yossi},
  title       = {Selective Attention Improves Transformer},
  year        = {2024},
  date        = {2024-10-03},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.02703},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2410.02703},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Leviathan2024 - Selective Attention Improves Transformer.pdf:PDF:http\://arxiv.org/pdf/2410.02703v1},
}

@Article{Rahman2024,
  author      = {Rahman, Md Maklachur and Tutul, Abdullah Aman and Nath, Ankur and Laishram, Lamyanba and Jung, Soon Ki and Hammond, Tracy},
  title       = {Mamba in Vision: A Comprehensive Survey of Techniques and Applications},
  year        = {2024},
  date        = {2024-10-04},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.03105},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2410.03105},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  abstract    = {Mamba is emerging as a novel approach to overcome the challenges faced by Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) in computer vision. While CNNs excel at extracting local features, they often struggle to capture long-range dependencies without complex architectural modifications. In contrast, ViTs effectively model global relationships but suffer from high computational costs due to the quadratic complexity of their self-attention mechanisms. Mamba addresses these limitations by leveraging Selective Structured State Space Models to effectively capture long-range dependencies with linear computational complexity. This survey analyzes the unique contributions, computational benefits, and applications of Mamba models while also identifying challenges and potential future research directions. We provide a foundational resource for advancing the understanding and growth of Mamba models in computer vision. An overview of this work is available at https://github.com/maklachur/Mamba-in-Computer-Vision.},
  priority    = {prio3},
  file        = {:Rahman2024 - Mamba in Vision_ a Comprehensive Survey of Techniques and Applications.pdf:PDF:http\://arxiv.org/pdf/2410.03105v1},
}

@Article{Liu2024,
  author      = {Liu, Ziming and Wang, Yixuan and Vaidya, Sachin and Ruehle, Fabian and Halverson, James and Soljačić, Marin and Hou, Thomas Y. and Tegmark, Max},
  date        = {2024-04-30},
  title       = {{KAN}: {Kolmogorov-Arnold} Networks},
  doi         = {10.48550/ARXIV.2404.19756},
  eprint      = {2404.19756},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Liu2024 - KAN_ Kolmogorov Arnold Networks.pdf:PDF:http\://arxiv.org/pdf/2404.19756v4},
  keywords    = {Machine Learning (cs.LG), Disordered Systems and Neural Networks (cond-mat.dis-nn), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Physical sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Feng2024,
  author      = {Feng, Leo and Tung, Frederick and Ahmed, Mohamed Osama and Bengio, Yoshua and Hajimirsadegh, Hossein},
  date        = {2024-10-02},
  title       = {Were {RNNs} All We Needed?},
  doi         = {10.48550/ARXIV.2410.01201},
  eprint      = {2410.01201},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Feng2024 - Were RNNs All We Needed_.pdf:PDF:http\://arxiv.org/pdf/2410.01201v2},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Yun2021,
  author         = {Yun, Sangdoo and Oh, Seong Joon and Heo, Byeongho and Han, Dongyoon and Choe, Junsuk and Chun, Sanghyuk},
  date           = {2021-01-13},
  title          = {Re-labeling {ImageNet}: from Single to Multi-Labels, from Global to Localized Labels},
  doi            = {10.48550/ARXIV.2101.05022},
  eprint         = {2101.05022},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution Share Alike 4.0 International},
  file           = {:Yun2021 - Re Labeling ImageNet_ from Single to Multi Labels, from Global to Localized Labels.pdf:PDF:http\://arxiv.org/pdf/2101.05022v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Wang2024e,
  author      = {Wang, Junxuan and Ge, Xuyang and Shu, Wentao and Tang, Qiong and Zhou, Yunhua and He, Zhengfu and Qiu, Xipeng},
  title       = {Towards Universality: Studying Mechanistic Similarity Across Language Model Architectures},
  year        = {2024},
  date        = {2024-10-09},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.06672},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2410.06672},
  copyright   = {Creative Commons Attribution Share Alike 4.0 International},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Wang2024e - Towards Universality_ Studying Mechanistic Similarity across Language Model Architectures.pdf:PDF:http\://arxiv.org/pdf/2410.06672v2},
}

@Article{Gavrikov2024,
  author      = {Gavrikov, Paul and Agnihotri, Shashank and Keuper, Margret and Keuper, Janis},
  title       = {How Do Training Methods Influence the Utilization of Vision Models?},
  year        = {2024},
  date        = {2024-10-18},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.14470},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2410.14470},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Gavrikov2024 - How Do Training Methods Influence the Utilization of Vision Models_.pdf:PDF:http\://arxiv.org/pdf/2410.14470v1},
}

@Article{Assran2023,
  author      = {Assran, Mahmoud and Duval, Quentin and Misra, Ishan and Bojanowski, Piotr and Vincent, Pascal and Rabbat, Michael and LeCun, Yann and Ballas, Nicolas},
  title       = {Self-Supervised Learning from Images with a Joint-Embedding Predictive Architecture},
  year        = {2023},
  date        = {2023-01-19},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2301.08243},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2301.08243},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  priority    = {prio2},
  file        = {:Assran2023 - Self Supervised Learning from Images with a Joint Embedding Predictive Architecture.pdf:PDF},
}

@Article{Kaul2024,
  author      = {Kaul, Prannay and Ma, Chengcheng and Elezi, Ismail and Deng, Jiankang},
  title       = {From Attention to Activation: Unravelling the Enigmas of Large Language Models},
  year        = {2024},
  date        = {2024-10-22},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.17174},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2410.17174},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Kaul2024 - From Attention to Activation_ Unravelling the Enigmas of Large Language Models.pdf:PDF:http\://arxiv.org/pdf/2410.17174v1},
}

@Article{Saratchandran2024,
  author      = {Saratchandran, Hemanth and Zheng, Jianqiao and Ji, Yiping and Zhang, Wenbo and Lucey, Simon},
  title       = {Rethinking Softmax: Self-Attention with Polynomial Activations},
  year        = {2024},
  date        = {2024-10-24},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.18613},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2410.18613},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (stat.ML), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Saratchandran2024 - Rethinking Softmax_ Self Attention with Polynomial Activations.pdf:PDF:http\://arxiv.org/pdf/2410.18613v1},
}

@Article{Docherty2024,
  author      = {Docherty, Ronan and Vamvakeros, Antonis and Cooper, Samuel J.},
  date        = {2024-10-20},
  title       = {Upsampling {DINOv2} features for unsupervised vision tasks and weakly supervised materials segmentation},
  doi         = {10.48550/ARXIV.2410.19836},
  eprint      = {2410.19836},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Docherty2024 - Upsampling DINOv2 Features for Unsupervised Vision Tasks and Weakly Supervised Materials Segmentation.pdf:PDF:http\://arxiv.org/pdf/2410.19836v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Materials Science (cond-mat.mtrl-sci), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Physical sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@InProceedings{Trabucco2024,
  author     = {Trabucco, Brandon and Doherty, Kyle and Gurinas, Max A and Salakhutdinov, Ruslan},
  title      = {Effective Data Augmentation With Diffusion Models},
  booktitle  = {The Twelfth International Conference on Learning Representations},
  year       = {2024},
  url        = {https://openreview.net/forum?id=ZWzUA9zeAg},
  readstatus = {read},
  file       = {:Trabucco2024 - Effective Data Augmentation with Diffusion Models.pdf:PDF},
}

@Article{Xiao2020,
  author      = {Xiao, Kai and Engstrom, Logan and Ilyas, Andrew and Madry, Aleksander},
  title       = {Noise or Signal: The Role of Image Backgrounds in Object Recognition},
  year        = {2020},
  date        = {2020-06-17},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2006.09994},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2006.09994},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  readstatus  = {read},
  file        = {:Xiao2020 - Noise or Signal_ the Role of Image Backgrounds in Object Recognition.pdf:PDF:http\://arxiv.org/pdf/2006.09994v1},
}

@InProceedings{Liang2023,
  author      = {Liang, Junhui and Liu, Ying and Vlassov, Vladimir},
  date        = {2023-08-18},
  title       = {The Impact of Background Removal on Performance of Neural Networks for Fashion Image Classification and Segmentation},
  doi         = {10.1109/csce60160.2023.00323},
  eprint      = {2308.09764},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  pages       = {1960--1968},
  booktitle   = {2023 Congress in Computer Science, Computer Engineering, \& Applied Computing (CSCE)},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Liang2023 - The Impact of Background Removal on Performance of Neural Networks for Fashion Image Classification and Segmentation.pdf:PDF:http\://arxiv.org/pdf/2308.09764v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  month       = jul,
  priority    = {prio1},
  publisher   = {IEEE},
  year        = {2023},
}

@Article{Akhmedova2024,
  author      = {Akhmedova, Shakhnaz and Körber, Nils},
  title       = {Next Generation Loss Function for Image Classification},
  year        = {2024},
  date        = {2024-04-19},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2404.12948},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2404.12948},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
  readstatus  = {read},
  file        = {:Akhmedova2024 - Next Generation Loss Function for Image Classification.pdf:PDF:http\://arxiv.org/pdf/2404.12948v1},
}

@InProceedings{Gonzalez2019,
  author      = {Gonzalez, Santiago and Miikkulainen, Risto},
  date        = {2019-05-27},
  booktitle   = {Proceedings of the 2020 IEEE Congress on Evolutionary Computation},
  title       = {Improved Training Speed, Accuracy, and Data Utilization Through Loss Function Optimization},
  doi         = {10.48550/ARXIV.1905.11528},
  eprint      = {1905.11528},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Gonzalez2019 - Improved Training Speed, Accuracy, and Data Utilization through Loss Function Optimization.pdf:PDF:http\://arxiv.org/pdf/1905.11528v3},
  keywords    = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), Machine Learning (stat.ML), FOS: Computer and information sciences},
  publisher   = {arXiv},
  readstatus  = {read},
  year        = {2019},
}

@Misc{Islam2021,
  author   = {Islam, Md Amirul and Kowal, Matthew and Jia, Sen and Derpanis, Konstantinos G. and Bruce, Neil},
  title    = {Boundary Effects in {CNNs}: Feature or Bug?},
  url      = {https://openreview.net/forum?id=M4qXqdw3xC},
  file     = {:Islam2021 - Boundary Effects in CNN_s_ Feature or Bug_.pdf:PDF},
  priority = {prio1},
  year     = {2021},
}

@Article{Islam2020,
  author      = {Islam, Md Amirul and Jia, Sen and Bruce, Neil D. B.},
  title       = {How Much Position Information Do Convolutional Neural Networks Encode?},
  year        = {2020},
  date        = {2020-01-22},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2001.08248},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2001.08248},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio1},
  file        = {:Islam2020 - How Much Position Information Do Convolutional Neural Networks Encode_.pdf:PDF:http\://arxiv.org/pdf/2001.08248v1},
}

@InProceedings{Islam2022a,
  author    = {Islam, Md Amirul and Kowal, Matthew and Esser, Patrick and Ommer, Bj{\"o}rn and Derpanis, Konstantinos G. and Bruce, Neil D. B.},
  booktitle = {British Machine Vision Conference (BMVC)},
  title     = {Maximizing Mutual Shape Information},
  pages     = {909},
  file      = {:Islam2022a - Maximizing Mutual Shape Information..pdf:PDF},
  priority  = {prio1},
  year      = {2022},
}

@Article{Zhang2024d,
  author      = {Zhang, Tianyi and Li, Baoxin and Seo, Jae-sun and Cao, Yu},
  title       = {Context-Aware Token Selection and Packing for Enhanced Vision Transformer},
  year        = {2024},
  date        = {2024-10-31},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.23608},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2410.23608},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Zhang2024d - Context Aware Token Selection and Packing for Enhanced Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2410.23608v2},
}

@Article{Kobayashi2024,
  author      = {Kobayashi, Seijin and Akram, Yassir and von Oswald, Johannes},
  date        = {2024-10-31},
  title       = {Weight decay induces low-rank attention layers},
  doi         = {10.48550/ARXIV.2410.23819},
  eprint      = {2410.23819},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Kobayashi2024 - Weight Decay Induces Low Rank Attention Layers.pdf:PDF:http\://arxiv.org/pdf/2410.23819v1},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Arya2024,
  author      = {Arya, Shreyash and Rao, Sukrut and Böhle, Moritz and Schiele, Bernt},
  title       = {B-cosification: Transforming Deep Neural Networks to be Inherently Interpretable},
  year        = {2024},
  date        = {2024-11-01},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2411.00715},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2411.00715},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Arya2024 - B Cosification_ Transforming Deep Neural Networks to Be Inherently Interpretable.pdf:PDF:http\://arxiv.org/pdf/2411.00715v1},
}

@Article{Luo2024,
  author      = {Luo, Xiangzhong and Liu, Di and Kong, Hao and Huai, Shuo and Chen, Hui and Xiong, Guochu and Liu, Weichen},
  title       = {Efficient Deep Learning Infrastructures for Embedded Computing Systems: A Comprehensive Survey and Future Envision},
  year        = {2024},
  date        = {2024-11-03},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2411.01431},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2411.01431},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:http\://arxiv.org/pdf/2411.01431v1:PDF},
}

@Article{Huang2024a,
  author      = {Huang, Zilong and Ye, Qinghao and Kang, Bingyi and Feng, Jiashi and Fan, Haoqi},
  title       = {Classification Done Right for Vision-Language Pre-Training},
  year        = {2024},
  date        = {2024-11-05},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2411.03313},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2411.03313},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio1},
  file        = {:Huang2024a - Classification Done Right for Vision Language Pre Training.pdf:PDF:http\://arxiv.org/pdf/2411.03313v2},
}

@Article{Li2024d,
  author      = {Li, Kevin Y. and Goyal, Sachin and Semedo, Joao D. and Kolter, J. Zico},
  date        = {2024-11-05},
  title       = {Inference Optimal {VLMs} Need Only One Visual Token but Larger Models},
  doi         = {10.48550/ARXIV.2411.03312},
  eprint      = {2411.03312},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Li2024d - Inference Optimal VLMs Need Only One Visual Token but Larger Models.pdf:PDF:http\://arxiv.org/pdf/2411.03312v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Ye2024,
  author      = {Ye, Wenqian and Zheng, Guangtao and Cao, Xu and Ma, Yunsheng and Zhang, Aidong},
  title       = {Spurious Correlations in Machine Learning: A Survey},
  year        = {2024},
  date        = {2024-02-20},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2402.12715},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2402.12715},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  readstatus  = {read},
  file        = {:Ye2024 - Spurious Correlations in Machine Learning_ a Survey.pdf:PDF:http\://arxiv.org/pdf/2402.12715v2},
}

@Article{Ponkshe2024,
  author      = {Ponkshe, Kaustubh and Singhal, Raghav and Gorbunov, Eduard and Tumanov, Alexey and Horvath, Samuel and Vepakomma, Praneeth},
  title       = {Initialization using Update Approximation is a Silver Bullet for Extremely Efficient Low-Rank Fine-Tuning},
  year        = {2024},
  date        = {2024-11-29},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2411.19557},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2411.19557},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Ponkshe2024 - Initialization Using Update Approximation Is a Silver Bullet for Extremely Efficient Low Rank Fine Tuning.pdf:PDF:http\://arxiv.org/pdf/2411.19557v1},
}

@Article{Halbe2024,
  author      = {Halbe, Shaunak and Tian, Junjiao and Joseph, K J and Smith, James Seale and Stevo, Katherine and Balasubramanian, Vineeth N and Kira, Zsolt},
  title       = {Grounding Descriptions in Images informs Zero-Shot Visual Recognition},
  year        = {2024},
  date        = {2024-12-05},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2412.04429},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2412.04429},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Halbe2024 - Grounding Descriptions in Images Informs Zero Shot Visual Recognition.pdf:PDF:http\://arxiv.org/pdf/2412.04429v1},
}

@Article{Lin2024,
  author      = {Lin, Zhenghao and Gou, Zhibin and Gong, Yeyun and Liu, Xiao and Shen, Yelong and Xu, Ruochen and Lin, Chen and Yang, Yujiu and Jiao, Jian and Duan, Nan and Chen, Weizhu},
  title       = {Rho-1: Not All Tokens Are What You Need},
  year        = {2024},
  date        = {2024-04-11},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2404.07965},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2404.07965},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Lin2024 - Rho 1_ Not All Tokens Are What You Need.pdf:PDF:http\://arxiv.org/pdf/2404.07965v3},
}

@Article{Zhang2024e,
  title       = {Assessing and Learning Alignment of Unimodal Vision and Language Models},
  author      = {Zhang, Le and Yang, Qian and Agrawal, Aishwarya},
  date        = {2024-12-05},
  year        = {2024},
  eprinttype  = {arXiv},
  eprint      = {2412.04616},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2412.04616},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Zhang2024e - Assessing and Learning Alignment of Unimodal Vision and Language Models.pdf:PDF:http\://arxiv.org/pdf/2412.04616v1},
}

@Article{Lew2024,
  title       = {Superpixel Tokenization for Vision Transformers: Preserving Semantic Integrity in Visual Tokens},
  author      = {Lew, Jaihyun and Jang, Soohyuk and Lee, Jaehoon and Yoo, Seungryong and Kim, Eunji and Lee, Saehyung and Mok, Jisoo and Kim, Siwon and Yoon, Sungroh},
  date        = {2024-12-06},
  year        = {2024},
  eprinttype  = {arXiv},
  eprint      = {2412.04680},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2412.04680},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Lew2024 - Superpixel Tokenization for Vision Transformers_ Preserving Semantic Integrity in Visual Tokens.pdf:PDF:http\://arxiv.org/pdf/2412.04680v1},
}

@Article{Li2024e,
  title       = {On the Surprising Effectiveness of Attention Transfer for Vision Transformers},
  author      = {Li, Alexander C. and Tian, Yuandong and Chen, Beidi and Pathak, Deepak and Chen, Xinlei},
  date        = {2024-11-14},
  year        = {2024},
  eprinttype  = {arXiv},
  eprint      = {2411.09702},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2411.09702},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Li2024e - On the Surprising Effectiveness of Attention Transfer for Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2411.09702v1},
}

@Article{Xu2024a,
  author      = {Xu, Minghao and Xiang, Lichuan and Cai, Xu and Wen, Hongkai},
  date        = {2024-12-16},
  title       = {No More {Adam}: Learning Rate Scaling at Initialization is All You Need},
  doi         = {10.48550/ARXIV.2412.11768},
  eprint      = {2412.11768},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Xu2024a - No More Adam_ Learning Rate Scaling at Initialization Is All You Need.pdf:PDF:http\://arxiv.org/pdf/2412.11768v1},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@InProceedings{GontijoLopes2021,
  author     = {Gontijo-Lopes, Raphael and Smullin, Sylvia and Cubuk, Ekin Dogus and Dyer, Ethan},
  booktitle  = {International Conference on Learning Representations},
  title      = {Tradeoffs in Data Augmentation: An Empirical Study},
  url        = {https://openreview.net/forum?id=ZcKPWuhG6wy},
  file       = {:/home/tnauen/cloud/JobDFKI/Papers/GontijoLopes2021 - Tradeoffs in Data Augmentation_ an Empirical Study.pdf:PDF},
  readstatus = {read},
  year       = {2021},
}

@Article{Guo2024a,
  author      = {Guo, Qiushan and De Mello, Shalini and Yin, Hongxu and Byeon, Wonmin and Cheung, Ka Chun and Yu, Yizhou and Luo, Ping and Liu, Sifei},
  date        = {2024-03-04},
  title       = {{RegionGPT}: Towards Region Understanding Vision Language Model},
  doi         = {10.48550/ARXIV.2403.02330},
  eprint      = {2403.02330},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Guo2024a - RegionGPT_ Towards Region Understanding Vision Language Model.pdf:PDF:http\://arxiv.org/pdf/2403.02330v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio1},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Liu2023c,
  title       = {Visual Instruction Tuning},
  author      = {Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
  date        = {2023-04-17},
  year        = {2023},
  eprinttype  = {arXiv},
  eprint      = {2304.08485},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2304.08485},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio1},
  file        = {:Liu2023c - Visual Instruction Tuning.pdf:PDF:http\://arxiv.org/pdf/2304.08485v2},
}

@Article{Liu2023d,
  title       = {Improved Baselines with Visual Instruction Tuning},
  author      = {Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae},
  date        = {2023-10-05},
  year        = {2023},
  eprinttype  = {arXiv},
  eprint      = {2310.03744},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2310.03744},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio1},
  file        = {:Liu2023d - Improved Baselines with Visual Instruction Tuning.pdf:PDF:http\://arxiv.org/pdf/2310.03744v2},
}

@Article{Li2024f,
  author      = {Li, Zhiyuan and Xia, Tingyu and Chang, Yi and Wu, Yuan},
  date        = {2024-12-19},
  title       = {A Survey of {RWKV}},
  doi         = {10.48550/ARXIV.2412.14847},
  eprint      = {2412.14847},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Li2024f - A Survey of RWKV.pdf:PDF:http\://arxiv.org/pdf/2412.14847v2},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Jose2024,
  author      = {Jose, Cijo and Moutakanni, Théo and Kang, Dahyun and Baldassarre, Federico and Darcet, Timothée and Xu, Hu and Li, Daniel and Szafraniec, Marc and Ramamonjisoa, Michaël and Oquab, Maxime and Siméoni, Oriane and Vo, Huy V. and Labatut, Patrick and Bojanowski, Piotr},
  date        = {2024-12-20},
  title       = {{DINOv2} Meets Text: A Unified Framework for Image- and Pixel-Level Vision-Language Alignment},
  doi         = {10.48550/ARXIV.2412.16334},
  eprint      = {2412.16334},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Jose2024 - DINOv2 Meets Text_ a Unified Framework for Image and Pixel Level Vision Language Alignment.pdf:PDF:http\://arxiv.org/pdf/2412.16334v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio1},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Yoa2024,
  author      = {Yoa, Seungdong and Lee, Seungjun and Cho, Hyeseung and Kim, Bumsoo and Lim, Woohyung},
  date        = {2024-12-21},
  title       = {{ImagePiece}: Content-aware Re-tokenization for Efficient Image Recognition},
  doi         = {10.48550/ARXIV.2412.16491},
  eprint      = {2412.16491},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Yoa2024 - ImagePiece_ Content Aware Re Tokenization for Efficient Image Recognition.pdf:PDF:http\://arxiv.org/pdf/2412.16491v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Oko2025,
  author      = {Oko, Kazusato and Lin, Licong and Cai, Yuhang and Mei, Song},
  date        = {2025-01-08},
  title       = {A Statistical Theory of Contrastive Pre-training and Multimodal Generative {AI}},
  doi         = {10.48550/ARXIV.2501.04641},
  eprint      = {2501.04641},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file        = {:Oko2025 - A Statistical Theory of Contrastive Pre Training and Multimodal Generative AI.pdf:PDF:http\://arxiv.org/pdf/2501.04641v1},
  groups      = {Reading Group Potential},
  keywords    = {Machine Learning (cs.LG), Statistics Theory (math.ST), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Mathematics},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Zheng2025,
  title       = {The Linear Attention Resurrection in Vision Transformer},
  author      = {Zheng, Chuanyang},
  date        = {2025-01-27},
  year        = {2025},
  eprinttype  = {arXiv},
  eprint      = {2501.16182},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2501.16182},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Zheng2025 - The Linear Attention Resurrection in Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2501.16182v1},
}

@Article{Dwibedi2017,
  title          = {Cut, Paste and Learn: Surprisingly Easy Synthesis for Instance Detection},
  author         = {Dwibedi, Debidatta and Misra, Ishan and Hebert, Martial},
  date           = {2017-08-04},
  year           = {2017},
  eprinttype     = {arXiv},
  eprint         = {1708.01642},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.1708.01642},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  file           = {:Dwibedi2017 - Cut, Paste and Learn_ Surprisingly Easy Synthesis for Instance Detection.pdf:PDF:http\://arxiv.org/pdf/1708.01642v1},
}

@InProceedings{Hinterstoisser2019,
  author         = {Hinterstoisser, Stefan and Pauly, Olivier and Heibel, Hauke and Martina, Marek and Bokeloh, Martin},
  booktitle      = {2019 IEEE/CVF International Conference on Computer Vision Workshop (ICCVW)},
  title          = {An Annotation Saved is an Annotation Earned: Using Fully Synthetic Training for Object Detection},
  doi            = {10.1109/ICCVW.2019.00340},
  pages          = {2787--2796},
  file           = {:Hinterstoisser2019 - An Annotation Saved Is an Annotation Earned_ Using Fully Synthetic Training for Object Detection.pdf:PDF},
  keywords       = {Computer vision;Conferences;Synthetic Data;Object Detection;Deep Learning},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2019},
}

@Article{Ge2023,
  author         = {Ge, Yunhao and Xu, Jiashu and Zhao, Brian Nlong and Joshi, Neel and Itti, Laurent and Vineet, Vibhav},
  title          = {Beyond Generation: Harnessing Text to Image Models for Object Detection and Segmentation},
  doi            = {10.48550/ARXIV.2309.05956},
  eprint         = {2309.05956},
  eprinttype     = {arXiv},
  url            = {https://api.semanticscholar.org/CorpusID:261697353},
  file           = {:Ge2023 - Beyond Generation_ Harnessing Text to Image Models for Object Detection and Segmentation.pdf:PDF},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Werman2021,
  author         = {Kassel, Levi and Werman, Michael},
  date           = {2021-12-20},
  title          = {{DeePaste} -- Inpainting for Pasting},
  doi            = {10.48550/ARXIV.2112.10600},
  eprint         = {2112.10600},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  file           = {:Werman2021 - DeePaste Inpainting for Pasting.pdf:PDF:http\://arxiv.org/pdf/2112.10600v2},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Ghiasi2020,
  title          = {Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation},
  author         = {Ghiasi, Golnaz and Cui, Yin and Srinivas, Aravind and Qian, Rui and Lin, Tsung-Yi and Cubuk, Ekin D. and Le, Quoc V. and Zoph, Barret},
  date           = {2020-12-13},
  year           = {2020},
  eprinttype     = {arXiv},
  eprint         = {2012.07177},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2012.07177},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  file           = {:Ghiasi2020 - Simple Copy Paste Is a Strong Data Augmentation Method for Instance Segmentation.pdf:PDF:http\://arxiv.org/pdf/2012.07177v2},
}

@Article{Cubuk2019,
  author      = {Cubuk, Ekin D. and Zoph, Barret and Shlens, Jonathon and Le, Quoc V.},
  date        = {2019-09-30},
  title       = {{RandAugment}: Practical automated data augmentation with a reduced search space},
  doi         = {10.48550/ARXIV.1909.13719},
  eprint      = {1909.13719},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Cubuk2019 - RandAugment_ Practical Automated Data Augmentation with a Reduced Search Space.pdf:PDF:http\://arxiv.org/pdf/1909.13719v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio1},
  publisher   = {arXiv},
  year        = {2019},
}

@Article{Griffin2024,
  title          = {Zero-Shot Coreset Selection: Efficient Pruning for Unlabeled Data},
  author         = {Griffin, Brent A. and Marks, Jacob and Corso, Jason J.},
  date           = {2024-11-22},
  year           = {2024},
  eprinttype     = {arXiv},
  eprint         = {2411.15349},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2411.15349},
  publisher      = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  groups         = {Coreset for FL},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  file           = {:Griffin2024 - Zero Shot Coreset Selection_ Efficient Pruning for Unlabeled Data.pdf:PDF:http\://arxiv.org/pdf/2411.15349v1},
}

@Article{Guo2022,
  author         = {Guo, Chengcheng and Zhao, Bo and Bai, Yanbing},
  date           = {2022-04-18},
  title          = {{DeepCore}: A Comprehensive Library for Coreset Selection in Deep Learning},
  doi            = {10.48550/ARXIV.2204.08499},
  eprint         = {2204.08499},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  file           = {:Guo2022 - DeepCore_ a Comprehensive Library for Coreset Selection in Deep Learning.pdf:PDF:http\://arxiv.org/pdf/2204.08499v3},
  groups         = {Coreset for FL},
  keywords       = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2022},
}

@InProceedings{Yang2024c,
  author         = {Yang, Shuo and Cao, Zhe and Guo, Sheng and Zhang, Ruiheng and Luo, Ping and Zhang, Shengping and Nie, Liqiang},
  booktitle      = {Proceedings of the 41st International Conference on Machine Learning},
  title          = {Mind the Boundary: Coreset Selection via Reconstructing the Decision Boundary},
  editor         = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
  eventdate      = {2024-07-21/2024-07-27},
  pages          = {55948--55960},
  publisher      = {PMLR},
  series         = {Proceedings of Machine Learning Research},
  url            = {https://proceedings.mlr.press/v235/yang24b.html},
  volume         = {235},
  file           = {:Yang2024c - Mind the Boundary_ Coreset Selection Via Reconstructing the Decision Boundary.pdf:PDF},
  groups         = {Coreset for FL},
  month          = jul,
  pdf            = {https://raw.githubusercontent.com/mlresearch/v235/main/assets/yang24b/yang24b.pdf},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@Article{Killamsetty2021,
  author         = {Killamsetty, Krishnateja and Zhao, Xujiang and Chen, Feng and Iyer, Rishabh},
  date           = {2021-06-14},
  title          = {{RETRIEVE}: Coreset Selection for Efficient and Robust Semi-Supervised Learning},
  doi            = {10.48550/ARXIV.2106.07760},
  eprint         = {2106.07760},
  eprintclass    = {cs.LG},
  eprinttype     = {arXiv},
  copyright      = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file           = {:Killamsetty2021 - RETRIEVE_ Coreset Selection for Efficient and Robust Semi Supervised Learning.pdf:PDF:http\://arxiv.org/pdf/2106.07760v2},
  groups         = {Coreset for FL},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2021},
}

@Article{Huang2023,
  title          = {Efficient and Robust Quantization-aware Training via Adaptive Coreset Selection},
  author         = {Huang, Xijie and Liu, Zechun and Liu, Shih-Yang and Cheng, Kwang-Ting},
  date           = {2023-06-12},
  year           = {2023},
  eprinttype     = {arXiv},
  eprint         = {2306.07215},
  eprintclass    = {cs.LG},
  doi            = {10.48550/ARXIV.2306.07215},
  publisher      = {arXiv},
  copyright      = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords       = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  groups         = {Coreset for FL},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  file           = {:Huang2023 - Efficient and Robust Quantization Aware Training Via Adaptive Coreset Selection.pdf:PDF:http\://arxiv.org/pdf/2306.07215v3},
}

@Article{Cubuk2018,
  author      = {Cubuk, Ekin D. and Zoph, Barret and Mane, Dandelion and Vasudevan, Vijay and Le, Quoc V.},
  date        = {2018-05-24},
  title       = {{AutoAugment}: Learning Augmentation Policies from Data},
  doi         = {10.48550/ARXIV.1805.09501},
  eprint      = {1805.09501},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Cubuk2018 - AutoAugment_ Learning Augmentation Policies from Data.pdf:PDF:http\://arxiv.org/pdf/1805.09501v3},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2018},
}

@Article{Takahashi2018,
  author       = {Takahashi, Ryo and Matsubara, Takashi and Uehara, Kuniaki},
  date         = {2020-09},
  journaltitle = {IEEE Transactions on Circuits and Systems for Video Technology},
  title        = {Data Augmentation using Random Image Cropping and Patching for Deep {CNNs}},
  doi          = {10.1109/tcsvt.2019.2935128},
  eprint       = {1811.09030},
  eprintclass  = {cs.CV},
  eprinttype   = {arXiv},
  issn         = {1558-2205},
  number       = {9},
  pages        = {2917--2931},
  volume       = {30},
  copyright    = {arXiv.org perpetual, non-exclusive license},
  file         = {:Takahashi2018 - Data Augmentation Using Random Image Cropping and Patching for Deep CNNs.pdf:PDF:http\://arxiv.org/pdf/1811.09030v2},
  keywords     = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  month        = sep,
  publisher    = {Institute of Electrical and Electronics Engineers (IEEE)},
  year         = {2020},
}

@Article{Zhong2017,
  title       = {Random Erasing Data Augmentation},
  author      = {Zhong, Zhun and Zheng, Liang and Kang, Guoliang and Li, Shaozi and Yang, Yi},
  date        = {2017-08-16},
  year        = {2017},
  eprinttype  = {arXiv},
  eprint      = {1708.04896},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.1708.04896},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Zhong2017 - Random Erasing Data Augmentation.pdf:PDF:http\://arxiv.org/pdf/1708.04896v2},
}

@Article{Shorten2019,
  author       = {Shorten, Connor and Khoshgoftaar, Taghi M.},
  date         = {2019-07},
  journaltitle = {Journal of Big Data},
  title        = {A survey on Image Data Augmentation for Deep Learning},
  doi          = {10.1186/s40537-019-0197-0},
  issn         = {2196-1115},
  number       = {1},
  pages        = {60},
  volume       = {6},
  file         = {:Shorten2019 - A Survey on Image Data Augmentation for Deep Learning.pdf:PDF},
  publisher    = {Springer Science and Business Media LLC},
  year         = {2019},
}

@Article{Xu2023d,
  title        = {A Comprehensive Survey of Image Augmentation Techniques for Deep Learning},
  author       = {Xu, Mingle and Yoon, Sook and Fuentes, Alvaro and Park, Dong Sun},
  journaltitle = {Pattern Recognition},
  volume       = {137},
  pages        = {109347},
  date         = {2023-05},
  year         = {2023},
  publisher    = {Elsevier BV},
  issn         = {0031-3203},
  doi          = {10.1016/j.patcog.2023.109347},
  file         = {:Xu2023d - A Comprehensive Survey of Image Augmentation Techniques for Deep Learning.pdf:PDF},
}

@Article{Ling2022,
  title       = {Humans need not label more humans: Occlusion Copy \& Paste for Occluded Human Instance Segmentation},
  author      = {Ling, Evan and Huang, Dezhao and Hur, Minhoe},
  date        = {2022-10-07},
  year        = {2022},
  eprinttype  = {arXiv},
  eprint      = {2210.03686},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2210.03686},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Ling2022 - Humans Need Not Label More Humans_ Occlusion Copy & Paste for Occluded Human Instance Segmentation.pdf:PDF:http\://arxiv.org/pdf/2210.03686v1},
}

@Article{Hendrycks2019,
  title       = {Benchmarking Neural Network Robustness to Common Corruptions and Perturbations},
  author      = {Hendrycks, Dan and Dietterich, Thomas},
  date        = {2019-03-28},
  year        = {2019},
  eprinttype  = {arXiv},
  eprint      = {1903.12261},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.1903.12261},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (stat.ML), FOS: Computer and information sciences},
  priority    = {prio1},
  file        = {:Hendrycks2019 - Benchmarking Neural Network Robustness to Common Corruptions and Perturbations.pdf:PDF:http\://arxiv.org/pdf/1903.12261v1},
}

@Article{Li2023e,
  author       = {Li, Xiaodan and Chen, Yuefeng and Zhu, Yao and Wang, Shuhui and Zhang, Rong and Xue, Hui},
  date         = {2023-03-30},
  journaltitle = {CVPR 2023},
  title        = {{ImageNet-E}: Benchmarking Neural Network Robustness via Attribute Editing},
  doi          = {10.48550/ARXIV.2303.17096},
  eprint       = {2303.17096},
  eprintclass  = {cs.CV},
  eprinttype   = {arXiv},
  copyright    = {Creative Commons Attribution 4.0 International},
  file         = {:Li2023e - ImageNet E_ Benchmarking Neural Network Robustness Via Attribute Editing.pdf:PDF:http\://arxiv.org/pdf/2303.17096v1},
  keywords     = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority     = {prio1},
  publisher    = {arXiv},
  year         = {2023},
}

@Article{Zhang2024f,
  author      = {Zhang, Chenshuang and Pan, Fei and Kim, Junmo and Kweon, In So and Mao, Chengzhi},
  date        = {2024-03-27},
  title       = {{ImageNet-D}: Benchmarking Neural Network Robustness on Diffusion Synthetic Object},
  doi         = {10.48550/ARXIV.2403.18775},
  eprint      = {2403.18775},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Zhang2024f - ImageNet D_ Benchmarking Neural Network Robustness on Diffusion Synthetic Object.pdf:PDF:http\://arxiv.org/pdf/2403.18775v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio1},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Geirhos2018,
  author      = {Geirhos, Robert and Rubisch, Patricia and Michaelis, Claudio and Bethge, Matthias and Wichmann, Felix A. and Brendel, Wieland},
  date        = {2018-11-29},
  title       = {{ImageNet}-trained {CNNs} are biased towards texture; increasing shape bias improves accuracy and robustness},
  doi         = {10.48550/ARXIV.1811.12231},
  eprint      = {1811.12231},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Geirhos2018 - ImageNet Trained CNNs Are Biased Towards Texture\; Increasing Shape Bias Improves Accuracy and Robustness.pdf:PDF:http\://arxiv.org/pdf/1811.12231v3},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Neurons and Cognition (q-bio.NC), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Biological sciences},
  priority    = {prio1},
  publisher   = {arXiv},
  year        = {2018},
}

@Article{Suvorov2021,
  author      = {Suvorov, Roman and Logacheva, Elizaveta and Mashikhin, Anton and Remizova, Anastasia and Ashukha, Arsenii and Silvestrov, Aleksei and Kong, Naejin and Goka, Harshith and Park, Kiwoong and Lempitsky, Victor},
  date        = {2021-09-15},
  title       = {Resolution-robust Large Mask Inpainting with {Fourier} Convolutions},
  doi         = {10.48550/ARXIV.2109.07161},
  eprint      = {2109.07161},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Suvorov2021 - Resolution Robust Large Mask Inpainting with Fourier Convolutions.pdf:PDF:http\://arxiv.org/pdf/2109.07161v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  publisher   = {arXiv},
  year        = {2021},
}

@Article{Sun2024,
  title       = {Attentive Eraser: Unleashing Diffusion Model's Object Removal Potential via Self-Attention Redirection Guidance},
  author      = {Sun, Wenhao and Cui, Benlei and Dong, Xue-Mei and Tang, Jingqun},
  date        = {2024-12-17},
  year        = {2024},
  eprinttype  = {arXiv},
  eprint      = {2412.12974},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2412.12974},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Sun2024 - Attentive Eraser_ Unleashing Diffusion Model's Object Removal Potential Via Self Attention Redirection Guidance.pdf:PDF:http\://arxiv.org/pdf/2412.12974v3},
}

@Article{Ren2024,
  author      = {Ren, Tianhe and Liu, Shilong and Zeng, Ailing and Lin, Jing and Li, Kunchang and Cao, He and Chen, Jiayu and Huang, Xinyu and Chen, Yukang and Yan, Feng and Zeng, Zhaoyang and Zhang, Hao and Li, Feng and Yang, Jie and Li, Hongyang and Jiang, Qing and Zhang, Lei},
  date        = {2024-01-25},
  title       = {Grounded {SAM}: Assembling Open-World Models for Diverse Visual Tasks},
  doi         = {10.48550/ARXIV.2401.14159},
  eprint      = {2401.14159},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Ren2024 - Grounded SAM_ Assembling Open World Models for Diverse Visual Tasks.pdf:PDF:http\://arxiv.org/pdf/2401.14159v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Liu2023e,
  author         = {Liu, Shilong and Zeng, Zhaoyang and Ren, Tianhe and Li, Feng and Zhang, Hao and Yang, Jie and Jiang, Qing and Li, Chunyuan and Yang, Jianwei and Su, Hang and Zhu, Jun and Zhang, Lei},
  date           = {2023-03-09},
  title          = {Grounding {DINO}: Marrying {DINO} with Grounded Pre-Training for Open-Set Object Detection},
  doi            = {10.48550/ARXIV.2303.05499},
  eprint         = {2303.05499},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Liu2023e - Grounding DINO_ Marrying DINO with Grounded Pre Training for Open Set Object Detection.pdf:PDF:http\://arxiv.org/pdf/2303.05499v5},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2023},
}

@Article{Kirillov2023,
  title       = {Segment Anything},
  author      = {Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C. and Lo, Wan-Yen and Dollár, Piotr and Girshick, Ross},
  date        = {2023-04-05},
  year        = {2023},
  eprinttype  = {arXiv},
  eprint      = {2304.02643},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2304.02643},
  publisher   = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Kirillov2023 - Segment Anything.pdf:PDF:http\://arxiv.org/pdf/2304.02643v1},
}

@Article{Shermaine2025,
  title       = {Image compositing is all you need for data augmentation},
  author      = {Shermaine, Ang Jia Ning and Lazarou, Michalis and Stathaki, Tania},
  date        = {2025-02-19},
  year        = {2025},
  eprinttype  = {arXiv},
  eprint      = {2502.13936},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2502.13936},
  publisher   = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Shermaine2025 - Image Compositing Is All You Need for Data Augmentation.pdf:PDF:http\://arxiv.org/pdf/2502.13936v1},
}

% Stanford CS 231N course report.
% NOTE(review): volume/number/pages look like reference-manager export residue; confirm against the report before relying on them.
@Article{Le2015,
  author  = {Le, Ya and Yang, Xuan},
  title   = {Tiny {ImageNet} Visual Recognition Challenge},
  number  = {7},
  pages   = {3},
  volume  = {7},
  file    = {:Le2015 - Tiny Imagenet Visual Recognition Challenge.pdf:PDF},
  journal = {CS 231N},
  year    = {2015},
}

% The citation key keeps the spelling "Jonhson" so existing citations still resolve; the author field carries the correct surname.
@Book{Jonhson1995,
  author    = {Johnson, Norman L. and Kotz, Samuel and Balakrishnan, N.},
  title     = {Continuous Univariate Distributions},
  edition   = {2},
  isbn      = {0-471-58494-0},
  publisher = {Wiley},
  series    = {Wiley series in probability and mathematical statistics},
  year      = {1995},
}

% NOTE(review): a technical report normally needs an institution field; none is recorded here, confirm and add if known.
@TechReport{Maji2013,
  author        = {Maji, Subhransu and Kannala, Juho and Rahtu, Esa and Blaschko, Matthew and Vedaldi, Andrea},
  title         = {Fine-Grained Visual Classification of Aircraft},
  eprint        = {1306.5151},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  year          = {2013},
}

@Article{Dehghan2017,
  author      = {Dehghan, Afshin and Masood, Syed Zain and Shu, Guang and Ortiz, Enrique G.},
  date        = {2017-02-06},
  title       = {View Independent Vehicle Make, Model and Color Recognition Using Convolutional Neural Network},
  doi         = {10.48550/ARXIV.1702.01721},
  eprint      = {1702.01721},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Dehghan2017 - View Independent Vehicle Make, Model and Color Recognition Using Convolutional Neural Network.pdf:PDF:http\://arxiv.org/pdf/1702.01721v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2017},
}

@Article{Kaur2017,
  author      = {Kaur, Parneet and Sikka, Karan and Divakaran, Ajay},
  title       = {Combining Weakly and Webly Supervised Learning for Classifying Food Images},
  year        = {2017},
  date        = {2017-12-23},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {1712.08730},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.1712.08730},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Kaur2017 - Combining Weakly and Webly Supervised Learning for Classifying Food Images.pdf:PDF:http\://arxiv.org/pdf/1712.08730v1},
}

@InProceedings{Parkhi2012,
  author    = {Omkar M. Parkhi and Andrea Vedaldi and Andrew Zisserman and C. V. Jawahar},
  title     = {Cats and Dogs},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2012},
}

% Journal version (IJCV volume 128, issue 2). The citation key keeps 2016, the year of the first arXiv posting, so existing citations still resolve.
@Article{Selvaraju2016,
  author       = {Selvaraju, Ramprasaath R. and Cogswell, Michael and Das, Abhishek and Vedantam, Ramakrishna and Parikh, Devi and Batra, Dhruv},
  date         = {2020-02},
  journaltitle = {International Journal of Computer Vision},
  title        = {Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization},
  doi          = {10.1007/s11263-019-01228-7},
  eprint       = {1610.02391},
  eprintclass  = {cs.CV},
  eprinttype   = {arXiv},
  issn         = {1573-1405},
  number       = {2},
  pages        = {336--359},
  volume       = {128},
  copyright    = {arXiv.org perpetual, non-exclusive license},
  file         = {:Selvaraju2016 - Grad CAM_ Visual Explanations from Deep Networks Via Gradient Based Localization.pdf:PDF:http\://arxiv.org/pdf/1610.02391v4},
  keywords     = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  month        = feb,
  publisher    = {Springer Science and Business Media LLC},
  year         = {2020},
}

@InProceedings{Chattopadhay2018,
  author    = {Chattopadhay, Aditya and Sarkar, Anirban and Howlader, Prantik and Balasubramanian, Vineeth N.},
  booktitle = {2018 IEEE Winter Conference on Applications of Computer Vision (WACV)},
  title     = {Grad-CAM++: Generalized Gradient-Based Visual Explanations for Deep Convolutional Networks},
  doi       = {10.1109/WACV.2018.00097},
  pages     = {839--847},
  keywords  = {Visualization;Heating systems;Neurons;Machine learning;Predictive models;Mathematical model},
  year      = {2018},
}

@Article{Sundararajan2017,
  author      = {Sundararajan, Mukund and Taly, Ankur and Yan, Qiqi},
  title       = {Axiomatic Attribution for Deep Networks},
  year        = {2017},
  date        = {2017-03-04},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {1703.01365},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.1703.01365},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Sundararajan2017 - Axiomatic Attribution for Deep Networks.pdf:PDF:http\://arxiv.org/pdf/1703.01365v2},
}

@Article{Carion2020,
  author      = {Carion, Nicolas and Massa, Francisco and Synnaeve, Gabriel and Usunier, Nicolas and Kirillov, Alexander and Zagoruyko, Sergey},
  title       = {End-to-End Object Detection with Transformers},
  year        = {2020},
  date        = {2020-05-26},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2005.12872},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2005.12872},
  copyright   = {Creative Commons Zero v1.0 Universal},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Carion2020 - End to End Object Detection with Transformers.pdf:PDF:http\://arxiv.org/pdf/2005.12872v3},
}

@Article{Zong2022,
  author      = {Zong, Zhuofan and Song, Guanglu and Liu, Yu},
  title       = {DETRs with Collaborative Hybrid Assignments Training},
  year        = {2022},
  date        = {2022-11-22},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2211.12860},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2211.12860},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Zong2022 - DETRs with Collaborative Hybrid Assignments Training.pdf:PDF:http\://arxiv.org/pdf/2211.12860v6},
}

@Article{Wang2022a,
  author      = {Wang, Wenhui and Bao, Hangbo and Dong, Li and Bjorck, Johan and Peng, Zhiliang and Liu, Qiang and Aggarwal, Kriti and Mohammed, Owais Khan and Singhal, Saksham and Som, Subhojit and Wei, Furu},
  title       = {Image as a Foreign Language: BEiT Pretraining for All Vision and Vision-Language Tasks},
  year        = {2022},
  date        = {2022-08-22},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2208.10442},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2208.10442},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), FOS: Computer and information sciences},
  file        = {:Wang2022a - Image As a Foreign Language_ BEiT Pretraining for All Vision and Vision Language Tasks.pdf:PDF:http\://arxiv.org/pdf/2208.10442v2},
}

@Article{Liu2022d,
  author      = {Liu, Yue and Matsoukas, Christos and Strand, Fredrik and Azizpour, Hossein and Smith, Kevin},
  title       = {PatchDropout: Economizing Vision Transformers Using Patch Dropout},
  year        = {2022},
  date        = {2022-08-10},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2208.07220},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2208.07220},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Liu2022d - PatchDropout_ Economizing Vision Transformers Using Patch Dropout.pdf:PDF:http\://arxiv.org/pdf/2208.07220v2},
}

@Article{He2017,
  author      = {He, Kaiming and Gkioxari, Georgia and Dollár, Piotr and Girshick, Ross},
  title       = {Mask R-CNN},
  year        = {2017},
  date        = {2017-03-20},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {1703.06870},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.1703.06870},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:He2017 - Mask R CNN.pdf:PDF:http\://arxiv.org/pdf/1703.06870v3},
}

@InBook{Sanderson2022,
  author    = {Sanderson, Edward and Matuszewski, Bogdan J.},
  title     = {FCN-Transformer Feature Fusion for Polyp Segmentation},
  booktitle = {Medical Image Understanding and Analysis},
  pages     = {892--907},
  publisher = {Springer International Publishing},
  year      = {2022},
  date      = {2022},
  doi       = {10.1007/978-3-031-12053-4_65},
  isbn      = {9783031120534},
  issn      = {1611-3349},
}

@Article{Vezakis2024,
  author      = {Vezakis, Ioannis A. and Georgas, Konstantinos and Fotiadis, Dimitrios and Matsopoulos, George K.},
  title       = {EffiSegNet: Gastrointestinal Polyp Segmentation through a Pre-Trained EfficientNet-based Network with a Simplified Decoder},
  year        = {2024},
  date        = {2024-07-23},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2407.16298},
  eprintclass = {eess.IV},
  doi         = {10.48550/ARXIV.2407.16298},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Image and Video Processing (eess.IV), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences},
  file        = {:Vezakis2024 - EffiSegNet_ Gastrointestinal Polyp Segmentation through a Pre Trained EfficientNet Based Network with a Simplified Decoder.pdf:PDF:http\://arxiv.org/pdf/2407.16298v1},
}

@Article{Wang2022b,
  author      = {Wang, Wenhai and Dai, Jifeng and Chen, Zhe and Huang, Zhenhang and Li, Zhiqi and Zhu, Xizhou and Hu, Xiaowei and Lu, Tong and Lu, Lewei and Li, Hongsheng and Wang, Xiaogang and Qiao, Yu},
  title       = {InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions},
  year        = {2022},
  date        = {2022-11-10},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2211.05778},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2211.05778},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Wang2022b - InternImage_ Exploring Large Scale Vision Foundation Models with Deformable Convolutions.pdf:PDF:http\://arxiv.org/pdf/2211.05778v4},
}

@Article{Girshick2013,
  author      = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
  title       = {Rich feature hierarchies for accurate object detection and semantic segmentation},
  year        = {2013},
  date        = {2013-11-11},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {1311.2524},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.1311.2524},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Girshick2013 - Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation.pdf:PDF:http\://arxiv.org/pdf/1311.2524v5},
}

@InProceedings{Krizhevsky2012,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
  booktitle = {Advances in Neural Information Processing Systems},
  title     = {ImageNet Classification with Deep Convolutional Neural Networks},
  editor    = {F. Pereira and C. J. Burges and L. Bottou and K. Q. Weinberger},
  publisher = {Curran Associates, Inc.},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf},
  volume    = {25},
  year      = {2012},
}

@Article{Rangel2024,
  author       = {Rangel, Gabriela and Cuevas-Tello, Juan C. and Nunez-Varela, Jose and Puente, Cesar and Silva-Trujillo, Alejandra G.},
  title        = {A Survey on Convolutional Neural Networks and Their Performance Limitations in Image Recognition Tasks},
  journaltitle = {Journal of Sensors},
  volume       = {2024},
  number       = {1},
  editor       = {Feng, Lihang},
  year         = {2024},
  date         = {2024-01},
  publisher    = {Wiley},
  doi          = {10.1155/2024/2797320},
  issn         = {1687-7268},
}

@Article{Alomar2023,
  author       = {Alomar, Khaled and Aysel, Halil Ibrahim and Cai, Xiaohao},
  title        = {Data Augmentation in Classification and Segmentation: A Survey and New Strategies},
  journaltitle = {Journal of Imaging},
  volume       = {9},
  number       = {2},
  pages        = {46},
  year         = {2023},
  date         = {2023-02},
  publisher    = {MDPI AG},
  doi          = {10.3390/jimaging9020046},
  issn         = {2313-433X},
}

@Article{RojasGomez2023,
  author      = {Rojas-Gomez, Renan A. and Lim, Teck-Yian and Do, Minh N. and Yeh, Raymond A.},
  title       = {Making Vision Transformers Truly Shift-Equivariant},
  year        = {2023},
  date        = {2023-05-25},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2305.16316},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2305.16316},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:RojasGomez2023 - Making Vision Transformers Truly Shift Equivariant (1).pdf:PDF:http\://arxiv.org/pdf/2305.16316v2},
}

@Article{Ding2023a,
  author      = {Ding, Peijian and Soselia, Davit and Armstrong, Thomas and Su, Jiahao and Huang, Furong},
  title       = {Reviving Shift Equivariance in Vision Transformers},
  year        = {2023},
  date        = {2023-06-13},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2306.07470},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2306.07470},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file        = {:Ding2023a - Reviving Shift Equivariance in Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2306.07470v1},
}

@Article{Bates1955,
  author  = {Bates, G. E.},
  title   = {Joint distributions of time intervals for the occurrence of successive accidents in a generalized {Polya} urn scheme},
  pages   = {705--720},
  volume  = {26},
  journal = {Annals of Mathematical Statistics},
  year    = {1955},
}

@Article{Adebayo2018,
  author      = {Adebayo, Julius and Gilmer, Justin and Muelly, Michael and Goodfellow, Ian and Hardt, Moritz and Kim, Been},
  title       = {Sanity Checks for Saliency Maps},
  year        = {2018},
  date        = {2018-10-08},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {1810.03292},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.1810.03292},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  file        = {:Adebayo2018 - Sanity Checks for Saliency Maps.pdf:PDF:http\://arxiv.org/pdf/1810.03292v3},
}

@Article{Huang2016a,
  author      = {Huang, Jonathan and Rathod, Vivek and Sun, Chen and Zhu, Menglong and Korattikara, Anoop and Fathi, Alireza and Fischer, Ian and Wojna, Zbigniew and Song, Yang and Guadarrama, Sergio and Murphy, Kevin},
  title       = {Speed/accuracy trade-offs for modern convolutional object detectors},
  year        = {2016},
  date        = {2016-11-30},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {1611.10012},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.1611.10012},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Huang2016a - Speed_accuracy Trade Offs for Modern Convolutional Object Detectors.pdf:PDF:http\://arxiv.org/pdf/1611.10012v3},
  priority    = {prio1},
}

@Article{Shen2023,
  author         = {Shen, Yunhang and Fu, Chaoyou and Chen, Peixian and Zhang, Mengdan and Li, Ke and Sun, Xing and Wu, Yunsheng and Lin, Shaohui and Ji, Rongrong},
  title          = {Aligning and Prompting Everything All at Once for Universal Visual Perception},
  year           = {2023},
  date           = {2023-12-04},
  publisher      = {arXiv},
  eprinttype     = {arXiv},
  eprint         = {2312.02153},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2312.02153},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file           = {:Shen2023 - Aligning and Prompting Everything All at Once for Universal Visual Perception.pdf:PDF:http\://arxiv.org/pdf/2312.02153v1},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
}

@Article{Sinhamahapatra2024,
  author         = {Sinhamahapatra, Poulami and Schwaiger, Franziska and Bose, Shirsha and Wang, Huiyu and Roscher, Karsten and Guennemann, Stephan},
  title          = {Finding Dino: A Plug-and-Play Framework for Zero-Shot Detection of Out-of-Distribution Objects Using Prototypes},
  year           = {2024},
  date           = {2024-04-11},
  publisher      = {arXiv},
  eprinttype     = {arXiv},
  eprint         = {2404.07664},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2404.07664},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  file           = {:Sinhamahapatra2024 - Finding Dino_ a Plug and Play Framework for Zero Shot Detection of Out of Distribution Objects Using Prototypes.pdf:PDF:http\://arxiv.org/pdf/2404.07664v2},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Li2022a,
  author         = {Li, Feng and Zhang, Hao and Xu, Huaizhe and Liu, Shilong and Zhang, Lei and Ni, Lionel M. and Shum, Heung-Yeung},
  date           = {2022-06-06},
  title          = {Mask DINO: Towards A Unified Transformer-based Framework for Object Detection and Segmentation},
  doi            = {10.48550/ARXIV.2206.02777},
  eprint         = {2206.02777},
  eprintclass    = {cs.CV},
  eprinttype     = {arXiv},
  copyright      = {arXiv.org perpetual, non-exclusive license},
  file           = {:Li2022a - Mask DINO_ Towards a Unified Transformer Based Framework for Object Detection and Segmentation.pdf:PDF:http\://arxiv.org/pdf/2206.02777v3},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher      = {arXiv},
  qualityassured = {qualityAssured},
  readstatus     = {skimmed},
  year           = {2022},
}

@Article{Maninis2024,
  author         = {Maninis, Kevis-Kokitsi and Chen, Kaifeng and Ghosh, Soham and Karpur, Arjun and Chen, Koert and Xia, Ye and Cao, Bingyi and Salz, Daniel and Han, Guangxing and Dlabal, Jan and Gnanapragasam, Dan and Seyedhosseini, Mojtaba and Zhou, Howard and Araujo, Andre},
  title          = {TIPS: Text-Image Pretraining with Spatial awareness},
  year           = {2024},
  date           = {2024-10-21},
  publisher      = {arXiv},
  eprinttype     = {arXiv},
  eprint         = {2410.16512},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2410.16512},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file           = {:Maninis2024 - TIPS_ Text Image Pretraining with Spatial Awareness.pdf:PDF:http\://arxiv.org/pdf/2410.16512v2},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Naeem2023,
  author      = {Naeem, Muhammad Ferjad and Xian, Yongqin and Zhai, Xiaohua and Hoyer, Lukas and Van Gool, Luc and Tombari, Federico},
  title       = {SILC: Improving Vision Language Pretraining with Self-Distillation},
  year        = {2023},
  date        = {2023-10-20},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2310.13355},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2310.13355},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Naeem2023 - SILC_ Improving Vision Language Pretraining with Self Distillation.pdf:PDF:http\://arxiv.org/pdf/2310.13355v2},
  priority    = {prio2},
}

@Article{Xiao2024,
  author         = {Xiao, Weiwei and Chen, Yongyong and Shan, Qiben and Wang, Yaowei and Su, Jingyong},
  date           = {2024-03},
  journaltitle   = {Proceedings of the AAAI Conference on Artificial Intelligence},
  title          = {Feature Distribution Matching by Optimal Transport for Effective and Robust Coreset Selection},
  doi            = {10.1609/aaai.v38i8.28771},
  issn           = {2159-5399},
  number         = {8},
  pages          = {9196--9204},
  volume         = {38},
  file           = {:Xiao2024 - Feature Distribution Matching by Optimal Transport for Effective and Robust Coreset Selection.pdf:PDF},
  groups         = {Coreset for FL},
  publisher      = {Association for the Advancement of Artificial Intelligence (AAAI)},
  qualityassured = {qualityAssured},
  readstatus     = {read},
  year           = {2024},
}

@InProceedings{Yang2024d,
  author    = {Yang, Shuo and Cao, Zhe and Guo, Sheng and Zhang, Ruiheng and Luo, Ping and Zhang, Shengping and Nie, Liqiang},
  booktitle = {Proceedings of the 41st International Conference on Machine Learning},
  title     = {Mind the Boundary: Coreset Selection via Reconstructing the Decision Boundary},
  editor    = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
  pages     = {55948--55960},
  publisher = {PMLR},
  series    = {Proceedings of Machine Learning Research},
  url       = {https://proceedings.mlr.press/v235/yang24b.html},
  volume    = {235},
  file      = {:Yang2024d - Mind the Boundary_ Coreset Selection Via Reconstructing the Decision Boundary.pdf:PDF},
  groups    = {Coreset for FL},
  month     = jul,
  pdf       = {https://raw.githubusercontent.com/mlresearch/v235/main/assets/yang24b/yang24b.pdf},
  year      = {2024},
}

@Article{Guo2024b,
  author         = {Guo, Yangyang and Kankanhalli, Mohan},
  title          = {SCAN: Bootstrapping Contrastive Pre-training for Data Efficiency},
  year           = {2024},
  date           = {2024-11-14},
  publisher      = {arXiv},
  eprinttype     = {arXiv},
  eprint         = {2411.09126},
  eprintclass    = {cs.CV},
  doi            = {10.48550/ARXIV.2411.09126},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file           = {:Guo2024b - SCAN_ Bootstrapping Contrastive Pre Training for Data Efficiency.pdf:PDF:http\://arxiv.org/pdf/2411.09126v1},
  groups         = {Coreset for FL},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Nauen2025a,
  author      = {Nauen, Tobias Christian and Moser, Brian and Raue, Federico and Frolov, Stanislav and Dengel, Andreas},
  title       = {ForAug: Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation},
  year        = {2025},
  date        = {2025-03-12},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2503.09399},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2503.09399},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, I.2.10; I.2.6; I.4.6, 68T45},
  file        = {:Nauen2025 - ForAug_ Recombining Foregrounds and Backgrounds to Improve Vision Transformer Training with Bias Mitigation.pdf:PDF:http\://arxiv.org/pdf/2503.09399v1},
}

@Article{Mehra2025,
  author         = {Mehra, Akshay and Mittal, Trisha and Gopalakrishnan, Subhadra and Kimball, Joshua},
  title          = {Model-agnostic Coreset Selection via LLM-based Concept Bottlenecks},
  year           = {2025},
  date           = {2025-02-23},
  publisher      = {arXiv},
  eprinttype     = {arXiv},
  eprint         = {2502.16733},
  eprintclass    = {cs.LG},
  doi            = {10.48550/ARXIV.2502.16733},
  copyright      = {Creative Commons Attribution 4.0 International},
  keywords       = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  file           = {:Mehra2025 - Model Agnostic Coreset Selection Via LLM Based Concept Bottlenecks.pdf:PDF:http\://arxiv.org/pdf/2502.16733v1},
  groups         = {Coreset for FL},
  qualityassured = {qualityAssured},
  readstatus     = {read},
}

@Article{Zhu2025,
  author      = {Zhu, Jiachen and Chen, Xinlei and He, Kaiming and LeCun, Yann and Liu, Zhuang},
  title       = {Transformers without Normalization},
  year        = {2025},
  date        = {2025-03-13},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2503.10622},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2503.10622},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Zhu2025 - Transformers without Normalization.pdf:PDF:http\://arxiv.org/pdf/2503.10622v1},
  priority    = {prio2},
}

@Article{Dorszewski2025,
  author      = {Dorszewski, Teresa and Tětková, Lenka and Jenssen, Robert and Hansen, Lars Kai and Wickstrøm, Kristoffer Knutsen},
  title       = {From Colors to Classes: Emergence of Concepts in Vision Transformers},
  year        = {2025},
  date        = {2025-03-31},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2503.24071},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2503.24071},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  abstract    = {Vision Transformers (ViTs) are increasingly utilized in various computer vision tasks due to their powerful representation capabilities. However, it remains understudied how ViTs process information layer by layer. Numerous studies have shown that convolutional neural networks (CNNs) extract features of increasing complexity throughout their layers, which is crucial for tasks like domain adaptation and transfer learning. ViTs, lacking the same inductive biases as CNNs, can potentially learn global dependencies from the first layers due to their attention mechanisms. Given the increasing importance of ViTs in computer vision, there is a need to improve the layer-wise understanding of ViTs. In this work, we present a novel, layer-wise analysis of concepts encoded in state-of-the-art ViTs using neuron labeling. Our findings reveal that ViTs encode concepts with increasing complexity throughout the network. Early layers primarily encode basic features such as colors and textures, while later layers represent more specific classes, including objects and animals. As the complexity of encoded concepts increases, the number of concepts represented in each layer also rises, reflecting a more diverse and specific set of features. Additionally, different pretraining strategies influence the quantity and category of encoded concepts, with finetuning to specific downstream tasks generally reducing the number of encoded concepts and shifting the concepts to more relevant categories.},
  file        = {:Dorszewski2025 - From Colors to Classes_ Emergence of Concepts in Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2503.24071v1},
  groups      = {Reading Group Potential},
  priority    = {prio2},
}

@Article{Imam2024,
  author      = {Imam, Mohamed Fazli and Marew, Rufael Fedaku and Hassan, Jameel and Fiaz, Mustansar and Aji, Alham Fikri and Cholakkal, Hisham},
  title       = {CLIP meets DINO for Tuning Zero-Shot Classifier using Unlabeled Image Collections},
  year        = {2024},
  date        = {2024-11-28},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2411.19346},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2411.19346},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Imam2024 - CLIP Meets DINO for Tuning Zero Shot Classifier Using Unlabeled Image Collections.pdf:PDF:http\://arxiv.org/pdf/2411.19346v2},
  priority    = {prio2},
}

@Article{Kerssies2025,
  author      = {Kerssies, Tommie and Cavagnero, Niccolò and Hermans, Alexander and Norouzi, Narges and Averta, Giuseppe and Leibe, Bastian and Dubbelman, Gijs and de Geus, Daan},
  title       = {Your ViT is Secretly an Image Segmentation Model},
  year        = {2025},
  date        = {2025-03-24},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2503.19108},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2503.19108},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Kerssies2025 - Your ViT Is Secretly an Image Segmentation Model.pdf:PDF:http\://arxiv.org/pdf/2503.19108v1},
  priority    = {prio2},
}

@Article{Somvanshi2025,
  author      = {Somvanshi, Shriyank and Islam, Md Monzurul and Mimi, Mahmuda Sultana and Polock, Sazzad Bin Bashar and Chhetri, Gaurab and Das, Subasish},
  title       = {A Survey on Structured State Space Sequence (S4) Models},
  year        = {2025},
  date        = {2025-03-22},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2503.18970},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2503.18970},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  file        = {:Somvanshi2025 - A Survey on Structured State Space Sequence (S4) Models.pdf:PDF:http\://arxiv.org/pdf/2503.18970v1},
  priority    = {prio3},
}

@Article{Wang2025,
  author      = {Wang, Qin and Bruns, Benjamin and Scharr, Hanno and Krajsek, Kai},
  title       = {Self-Supervised Learning based on Transformed Image Reconstruction for Equivariance-Coherent Feature Representation},
  year        = {2025},
  date        = {2025-03-24},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2503.18753},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2503.18753},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  file        = {:Wang2025 - Self Supervised Learning Based on Transformed Image Reconstruction for Equivariance Coherent Feature Representation.pdf:PDF:http\://arxiv.org/pdf/2503.18753v1},
  priority    = {prio2},
}

@Article{Hesse2025,
  title       = {Beyond Accuracy: What Matters in Designing Well-Behaved Models?},
  author      = {Hesse, Robin and Bağcı, Doğukan and Schiele, Bernt and Schaub-Meyer, Simone and Roth, Stefan},
  date        = {2025-03-21},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2503.17110},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2503.17110},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio1},
  file        = {:Hesse2025 - Beyond Accuracy_ What Matters in Designing Well Behaved Models_.pdf:PDF:http\://arxiv.org/pdf/2503.17110v1},
}

@Article{Tang2025,
  author      = {Tang, Zineng and Lian, Long and Eisape, Seun and Wang, XuDong and Herzig, Roei and Yala, Adam and Suhr, Alane and Darrell, Trevor and Chan, David M.},
  date        = {2025-03-19},
  title       = {{TULIP}: Towards Unified Language-Image Pretraining},
  doi         = {10.48550/ARXIV.2503.15485},
  eprint      = {2503.15485},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Tang2025 - TULIP_ Towards Unified Language Image Pretraining.pdf:PDF:http\://arxiv.org/pdf/2503.15485v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Maity2025,
  author      = {Maity, Subhajit and Hitsman, Killian and Li, Xin and Dutta, Aritra},
  date        = {2025-03-13},
  title       = {{Kolmogorov-Arnold} Attention: Is Learnable Attention Better For Vision Transformers?},
  doi         = {10.48550/ARXIV.2503.10632},
  eprint      = {2503.10632},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file        = {:Maity2025 - Kolmogorov Arnold Attention_ Is Learnable Attention Better for Vision Transformers_.pdf:PDF:http\://arxiv.org/pdf/2503.10632v1},
  keywords    = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, I.2.6; I.5.1; I.5.5; I.5.4; I.4.10, 68T07},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Hammoud2025,
  author      = {Hammoud, Hasan Abed Al Kader and Ghanem, Bernard},
  date        = {2025-03-09},
  title       = {{DiffCLIP}: Differential Attention Meets {CLIP}},
  doi         = {10.48550/ARXIV.2503.06626},
  eprint      = {2503.06626},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Hammoud2025 - DiffCLIP_ Differential Attention Meets CLIP.pdf:PDF:http\://arxiv.org/pdf/2503.06626v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Luo2025,
  title       = {Simple Self Organizing Map with Visual Transformer},
  author      = {Luo, Alan and Yuan, Kaiwen},
  date        = {2025-03-06},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2503.04121},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2503.04121},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, 65D19 (Primary)},
  priority    = {prio2},
  file        = {:Luo2025 - Simple Self Organizing Map with Visual Transformer.pdf:PDF:http\://arxiv.org/pdf/2503.04121v1},
}

@Article{Li2025,
  author      = {Li, Ruining and Boduljak, Gabrijel and Zhou, Jensen},
  date        = {2025-04-03},
  title       = {On Vanishing Variance in Transformer Length Generalization},
  doi         = {10.48550/ARXIV.2504.02827},
  eprint      = {2504.02827},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Li2025 - On Vanishing Variance in Transformer Length Generalization.pdf:PDF:http\://arxiv.org/pdf/2504.02827v1},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio1},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Zhang2025,
  title       = {Image Recognition with Online Lightweight Vision Transformer: A Survey},
  author      = {Zhang, Zherui and Xu, Rongtao and Zhou, Jie and Wang, Changwei and Pei, Xingtian and Xu, Wenhao and Zhang, Jiguang and Guo, Li and Gao, Longxiang and Xu, Wenbo and Xu, Shibiao},
  date        = {2025-05-06},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.03113},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2505.03113},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  groups      = {WTF Benchmark},
  priority    = {prio2},
  file        = {:Zhang2025 - Image Recognition with Online Lightweight Vision Transformer_ a Survey.pdf:PDF:http\://arxiv.org/pdf/2505.03113v1},
}

@Article{Xiao2023,
  title       = {Efficient Streaming Language Models with Attention Sinks},
  author      = {Xiao, Guangxuan and Tian, Yuandong and Chen, Beidi and Han, Song and Lewis, Mike},
  date        = {2023-09-29},
  year        = {2023},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2309.17453},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2309.17453},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Xiao2023 - Efficient Streaming Language Models with Attention Sinks.pdf:PDF:http\://arxiv.org/pdf/2309.17453v4},
}

@Article{Joseph2025,
  author      = {Joseph, Sonia and Suresh, Praneet and Goldfarb, Ethan and Hufe, Lorenz and Gandelsman, Yossi and Graham, Robert and Bzdok, Danilo and Samek, Wojciech and Richards, Blake Aaron},
  date        = {2025-04-11},
  title       = {Steering {CLIP}'s vision transformer with sparse autoencoders},
  doi         = {10.48550/ARXIV.2504.08729},
  eprint      = {2504.08729},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Joseph2025 - Steering CLIP's Vision Transformer with Sparse Autoencoders.pdf:PDF:http\://arxiv.org/pdf/2504.08729v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Qian2025,
  author      = {Qian, Zhoujie},
  date        = {2025-04-21},
  title       = {{ECViT}: Efficient Convolutional Vision Transformer with Local-Attention and Multi-scale Stages},
  doi         = {10.48550/ARXIV.2504.14825},
  eprint      = {2504.14825},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Qian2025 - ECViT_ Efficient Convolutional Vision Transformer with Local Attention and Multi Scale Stages.pdf:PDF:http\://arxiv.org/pdf/2504.14825v1},
  groups      = {WTF Benchmark},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Ji2025,
  title       = {Always Skip Attention},
  author      = {Ji, Yiping and Saratchandran, Hemanth and Moghaddam, Peyman and Lucey, Simon},
  date        = {2025-05-04},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.01996},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2505.01996},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  groups      = {WTF Benchmark},
  file        = {:Ji2025 - Always Skip Attention.pdf:PDF:http\://arxiv.org/pdf/2505.01996v1},
}

@Article{Yamada2025,
  author      = {Yamada, Yoshihiro},
  date        = {2025-04-09},
  title       = {{CAT}: Circular-Convolutional Attention for Sub-Quadratic Transformers},
  doi         = {10.48550/ARXIV.2504.06704},
  eprint      = {2504.06704},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Yamada2025 - CAT_ Circular Convolutional Attention for Sub Quadratic Transformers.pdf:PDF:http\://arxiv.org/pdf/2504.06704v1},
  groups      = {WTF Benchmark},
  keywords    = {Machine Learning (cs.LG), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Liu2025,
  author      = {Liu, Jiani and Wang, Zhiyuan and Zhang, Zeliang and Huang, Chao and Liang, Susan and Tang, Yunlong and Xu, Chenliang},
  date        = {2025-04-15},
  title       = {The Sword of {Damocles} in {ViTs}: Computational Redundancy Amplifies Adversarial Transferability},
  doi         = {10.48550/ARXIV.2504.10804},
  eprint      = {2504.10804},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Liu2025 - The Sword of Damocles in ViTs_ Computational Redundancy Amplifies Adversarial Transferability.pdf:PDF:http\://arxiv.org/pdf/2504.10804v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Trivedy2025,
  title       = {Learning Object Focused Attention},
  author      = {Trivedy, Vivek and Almalki, Amani and Latecki, Longin Jan},
  date        = {2025-04-10},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2504.08166},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2504.08166},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Trivedy2025 - Learning Object Focused Attention.pdf:PDF:http\://arxiv.org/pdf/2504.08166v1},
}

@Article{Dey2025,
  author      = {Dey, Nolan and Zhang, Bin Claire and Noci, Lorenzo and Li, Mufan and Bordelon, Blake and Bergsma, Shane and Pehlevan, Cengiz and Hanin, Boris and Hestness, Joel},
  date        = {2025-05-02},
  title       = {Don't be lazy: {CompleteP} enables compute-efficient deep transformers},
  doi         = {10.48550/ARXIV.2505.01618},
  eprint      = {2505.01618},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Dey2025 - Don't Be Lazy_ CompleteP Enables Compute Efficient Deep Transformers.pdf:PDF:http\://arxiv.org/pdf/2505.01618v1},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Zuhri2025,
  title       = {Softpick: No Attention Sink, No Massive Activations with Rectified Softmax},
  author      = {Zuhri, Zayd M. K. and Fuadi, Erland Hilman and Aji, Alham Fikri},
  date        = {2025-04-29},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2504.20966},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2504.20966},
  copyright   = {Creative Commons Attribution Share Alike 4.0 International},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Zuhri2025 - Softpick_ No Attention Sink, No Massive Activations with Rectified Softmax.pdf:PDF:http\://arxiv.org/pdf/2504.20966v1},
}

@Article{Bolya2025,
  title       = {Perception Encoder: The best visual embeddings are not at the output of the network},
  author      = {Bolya, Daniel and Huang, Po-Yao and Sun, Peize and Cho, Jang Hyun and Madotto, Andrea and Wei, Chen and Ma, Tengyu and Zhi, Jiale and Rajasegaran, Jathushan and Rasheed, Hanoona and Wang, Junke and Monteiro, Marco and Xu, Hu and Dong, Shiyu and Ravi, Nikhila and Li, Daniel and Dollár, Piotr and Feichtenhofer, Christoph},
  date        = {2025-04-17},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2504.13181},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2504.13181},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Bolya2025 - Perception Encoder_ the Best Visual Embeddings Are Not at the Output of the Network.pdf:PDF:http\://arxiv.org/pdf/2504.13181v2},
}

@Article{Kang2022,
  author       = {Kang, Ji-Soo and Chung, Kyungyong},
  date         = {2022},
  journaltitle = {IEEE Access},
  title        = {{STAug}: Copy-Paste Based Image Augmentation Technique Using Salient Target},
  doi          = {10.1109/access.2022.3224141},
  issn         = {2169-3536},
  pages        = {123605--123613},
  volume       = {10},
  file         = {:Kang2022 - STAug_ Copy Paste Based Image Augmentation Technique Using Salient Target.pdf:PDF},
  priority     = {prio1},
  publisher    = {Institute of Electrical and Electronics Engineers (IEEE)},
}

@Article{Guo2023,
  author      = {Guo, Yilu and Shi, Xingyue and Chen, Weijie and Yang, Shicai and Xie, Di and Pu, Shiliang and Zhuang, Yueting},
  date        = {2023-01-12},
  title       = {1st Place Solution for {ECCV} 2022 {OOD-CV} Challenge Image Classification Track},
  doi         = {10.48550/ARXIV.2301.04795},
  eprint      = {2301.04795},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Guo2023 - 1st Place Solution for ECCV 2022 OOD CV Challenge Image Classification Track.pdf:PDF:http\://arxiv.org/pdf/2301.04795v1},
  groups      = {ForAug},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@Article{Lappe2025,
  author      = {Lappe, Alexander and Giese, Martin A.},
  date        = {2025-05-09},
  title       = {Register and {CLS} tokens yield a decoupling of local and global features in large {ViTs}},
  doi         = {10.48550/ARXIV.2505.05892},
  eprint      = {2505.05892},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Lappe2025 - Register and CLS Tokens Yield a Decoupling of Local and Global Features in Large ViTs.pdf:PDF:http\://arxiv.org/pdf/2505.05892v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Adeel2025,
  title       = {Beyond Attention: Toward Machines with Intrinsic Higher Mental States},
  author      = {Adeel, Ahsan},
  date        = {2025-05-02},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.06257},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2505.06257},
  copyright   = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Adeel2025 - Beyond Attention_ toward Machines with Intrinsic Higher Mental States.pdf:PDF:http\://arxiv.org/pdf/2505.06257v1},
}

@Article{Gerber2025,
  title       = {Attention Is Not All You Need: The Importance of Feedforward Networks in Transformer Models},
  author      = {Gerber, Isaac},
  date        = {2025-05-10},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.06633},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2505.06633},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  file        = {:Gerber2025 - Attention Is Not All You Need_ the Importance of Feedforward Networks in Transformer Models.pdf:PDF:http\://arxiv.org/pdf/2505.06633v1},
}

@Article{Kan2025,
  title       = {Optimal Control for Transformer Architectures: Enhancing Generalization, Robustness and Efficiency},
  author      = {Kan, Kelvin and Li, Xingjian and Zhang, Benjamin J. and Sahai, Tuhin and Osher, Stanley and Katsoulakis, Markos A.},
  date        = {2025-05-16},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.13499},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2505.13499},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Optimization and Control (math.OC), FOS: Computer and information sciences, FOS: Mathematics},
  priority    = {prio3},
  file        = {:Kan2025 - Optimal Control for Transformer Architectures_ Enhancing Generalization, Robustness and Efficiency.pdf:PDF:http\://arxiv.org/pdf/2505.13499v1},
}

@InProceedings{Amsel2025,
  author    = {Amsel, Noah and Yehudai, Gilad and Bruna, Joan},
  booktitle = {The Thirteenth International Conference on Learning Representations},
  title     = {Quality over Quantity in Attention Layers: When Adding More Heads Hurts},
  url       = {https://openreview.net/forum?id=y9Xp9NozPR},
  file      = {:Amsel2025 - Quality Over Quantity in Attention Layers_ When Adding More Heads Hurts.pdf:PDF},
  priority  = {prio2},
  year      = {2025},
}

@Article{Nordstroem2025,
  author      = {Nordström, David and Edstedt, Johan and Kahl, Fredrik and Bökman, Georg},
  date        = {2025-05-21},
  title       = {Stronger {ViTs} With Octic Equivariance},
  doi         = {10.48550/ARXIV.2505.15441},
  eprint      = {2505.15441},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Share Alike 4.0 International},
  file        = {:Nordstroem2025 - Stronger ViTs with Octic Equivariance.pdf:PDF:http\://arxiv.org/pdf/2505.15441v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Braso2025,
  title       = {Native Segmentation Vision Transformers},
  author      = {Brasó, Guillem and Ošep, Aljoša and Leal-Taixé, Laura},
  date        = {2025-05-22},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.16993},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2505.16993},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  groups      = {WTF Benchmark},
  priority    = {prio3},
  file        = {:Braso2025 - Native Segmentation Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2505.16993v1},
}

@Article{Shan2025,
  author      = {Shan, Jiquan and Wang, Junxiao and Zhao, Lifeng and Cai, Liang and Zhang, Hongyuan and Liritzis, Ioannis},
  date        = {2025-05-22},
  title       = {{AnchorFormer}: Differentiable Anchor Attention for Efficient Vision Transformer},
  doi         = {10.48550/ARXIV.2505.16463},
  eprint      = {2505.16463},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Shan2025 - AnchorFormer_ Differentiable Anchor Attention for Efficient Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2505.16463v1},
  groups      = {WTF Benchmark},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Ye2024a,
  title       = {Differential Transformer},
  author      = {Ye, Tianzhu and Dong, Li and Xia, Yuqing and Sun, Yutao and Zhu, Yi and Huang, Gao and Wei, Furu},
  date        = {2024-10-07},
  year        = {2024},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2410.05258},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2410.05258},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  file        = {:Ye2024a - Differential Transformer.pdf:PDF:http\://arxiv.org/pdf/2410.05258v2},
}

@Article{Fuller2025,
  author      = {Fuller, Anthony and Yassin, Yousef and Wen, Junfeng and Kyrollos, Daniel G. and Ibrahim, Tarek and Green, James R. and Shelhamer, Evan},
  date        = {2025-05-23},
  title       = {{LookWhere}? Efficient Visual Recognition by Learning Where to Look and What to See from Self-Supervision},
  doi         = {10.48550/ARXIV.2505.18051},
  eprint      = {2505.18051},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Fuller2025 - LookWhere_ Efficient Visual Recognition by Learning Where to Look and What to See from Self Supervision.pdf:PDF:http\://arxiv.org/pdf/2505.18051v1},
  groups      = {WTF Benchmark},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Shahabodini2025,
  title       = {The Missing Point in Vision Transformers for Universal Image Segmentation},
  author      = {Shahabodini, Sajjad and Mansoori, Mobina and Bayatmakou, Farnoush and Abouei, Jamshid and Plataniotis, Konstantinos N. and Mohammadi, Arash},
  date        = {2025-05-26},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.19795},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2505.19795},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Image and Video Processing (eess.IV), FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering},
  groups      = {WTF Benchmark},
  priority    = {prio2},
  file        = {:Shahabodini2025 - The Missing Point in Vision Transformers for Universal Image Segmentation.pdf:PDF:http\://arxiv.org/pdf/2505.19795v1},
}

@Article{Zheng2025a,
  title       = {Structured Initialization for Vision Transformers},
  author      = {Zheng, Jianqiao and Li, Xueqian and Saratchandran, Hemanth and Lucey, Simon},
  date        = {2025-05-26},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.19985},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2505.19985},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  groups      = {Reading Group Potential},
  priority    = {prio3},
  file        = {:Zheng2025a - Structured Initialization for Vision Transformers.pdf:PDF:http\://arxiv.org/pdf/2505.19985v1},
}

@Article{Kong2025,
  title       = {Token Reduction Should Go Beyond Efficiency in Generative Models -- From Vision, Language to Multimodality},
  author      = {Kong, Zhenglun and Li, Yize and Zeng, Fanhu and Xin, Lei and Messica, Shvat and Lin, Xue and Zhao, Pu and Kellis, Manolis and Tang, Hao and Zitnik, Marinka},
  date        = {2025-05-23},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.18227},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2505.18227},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  groups      = {WTF Benchmark},
  priority    = {prio3},
  file        = {:Kong2025 - Token Reduction Should Go beyond Efficiency in Generative Models from Vision, Language to Multimodality.pdf:PDF:http\://arxiv.org/pdf/2505.18227v1},
}

@InProceedings{Chowdhury2025,
  author    = {Chowdhury, Amartya Roy and Diddigi, Raghuram Bharadwaj and J, Prabuchandran K and Tripathi, Achyut Mani},
  booktitle = {Proceedings of the Winter Conference on Applications of Computer Vision (WACV)},
  title     = {Bandit Based Attention Mechanism in Vision Transformers},
  pages     = {9579--9588},
  file      = {:Chowdhury2025 - Bandit Based Attention Mechanism in Vision Transformers.pdf:PDF},
  groups    = {WTF Benchmark},
  month     = feb,
  priority  = {prio3},
  year      = {2025},
}

@Article{Fuller2025a,
  author      = {Fuller, Anthony and Yassin, Yousef and Kyrollos, Daniel G. and Shelhamer, Evan and Green, James R.},
  date        = {2025-02-20},
  title       = {Simpler Fast Vision Transformers with a Jumbo {CLS} Token},
  doi         = {10.48550/ARXIV.2502.15021},
  eprint      = {2502.15021},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Fuller2025a - Simpler Fast Vision Transformers with a Jumbo CLS Token.pdf:PDF:http\://arxiv.org/pdf/2502.15021v2},
  groups      = {WTF Benchmark},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Yao2025,
  title       = {Reconstruction vs. Generation: Taming Optimization Dilemma in Latent Diffusion Models},
  author      = {Yao, Jingfeng and Yang, Bin and Wang, Xinggang},
  date        = {2025-01-02},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2501.01423},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2501.01423},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio1},
  file        = {:Yao2025 - Reconstruction Vs. Generation_ Taming Optimization Dilemma in Latent Diffusion Models.pdf:PDF:http\://arxiv.org/pdf/2501.01423v3},
}

@Article{Darlow2025,
  title       = {Continuous Thought Machines},
  author      = {Darlow, Luke and Regan, Ciaran and Risi, Sebastian and Seely, Jeffrey and Jones, Llion},
  date        = {2025-05-08},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.05522},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2505.05522},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  groups      = {Reading Group Potential},
  priority    = {prio3},
  file        = {:Darlow2025 - Continuous Thought Machines.pdf:PDF:http\://arxiv.org/pdf/2505.05522v3},
}

@Article{Deng2025,
  title       = {Emerging Properties in Unified Multimodal Pretraining},
  author      = {Deng, Chaorui and Zhu, Deyao and Li, Kunchang and Gou, Chenhui and Li, Feng and Wang, Zeyu and Zhong, Shu and Yu, Weihao and Nie, Xiaonan and Song, Ziang and Shi, Guang and Fan, Haoqi},
  date        = {2025-05-20},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.14683},
  eprintclass = {cs.CV},
  doi         = {10.48550/ARXIV.2505.14683},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  groups      = {Reading Group Potential},
  priority    = {prio2},
  file        = {:Deng2025 - Emerging Properties in Unified Multimodal Pretraining.pdf:PDF:http\://arxiv.org/pdf/2505.14683v2},
}

@Article{Nie2025,
  title       = {Large Language Diffusion Models},
  author      = {Nie, Shen and Zhu, Fengqi and You, Zebin and Zhang, Xiaolu and Ou, Jingyang and Hu, Jun and Zhou, Jun and Lin, Yankai and Wen, Ji-Rong and Li, Chongxuan},
  date        = {2025-02-14},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2502.09992},
  eprintclass = {cs.CL},
  doi         = {10.48550/ARXIV.2502.09992},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  groups      = {Reading Group Potential},
  file        = {:Nie2025 - Large Language Diffusion Models.pdf:PDF:http\://arxiv.org/pdf/2502.09992v2},
}

@Article{Jha2025,
  title       = {Harnessing the Universal Geometry of Embeddings},
  author      = {Jha, Rishi and Zhang, Collin and Shmatikov, Vitaly and Morris, John X.},
  date        = {2025-05-18},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.12540},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2505.12540},
  copyright   = {Creative Commons Attribution 4.0 International},
  keywords    = {Machine Learning (cs.LG), FOS: Computer and information sciences},
  groups      = {Reading Group Potential},
  priority    = {prio2},
  file        = {:Jha2025 - Harnessing the Universal Geometry of Embeddings.pdf:PDF:http\://arxiv.org/pdf/2505.12540v2},
}

@Article{Xu2025,
  title       = {Visual Planning: Let's Think Only with Images},
  author      = {Xu, Yi and Li, Chengzu and Zhou, Han and Wan, Xingchen and Zhang, Caiqi and Korhonen, Anna and Vulić, Ivan},
  date        = {2025-05-16},
  year        = {2025},
  publisher   = {arXiv},
  eprinttype  = {arXiv},
  eprint      = {2505.11409},
  eprintclass = {cs.LG},
  doi         = {10.48550/ARXIV.2505.11409},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  groups      = {Reading Group Potential},
  file        = {:Xu2025 - Visual Planning_ Let's Think Only with Images.pdf:PDF:http\://arxiv.org/pdf/2505.11409v1},
}

@InProceedings{Wang2024f,
  author      = {Wang, Qizhou and Lin, Yong and Chen, Yongqiang and Schmidt, Ludwig and Han, Bo and Zhang, Tong},
  booktitle   = {The Thirty-eighth Annual Conference on Neural Information Processing Systems},
  title       = {A Sober Look at the Robustness of {CLIP}s to Spurious Features},
  eprint      = {2403.11497},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file        = {:Wang2024f - A Sober Look at the Robustness of CLIPs to Spurious Features.pdf:PDF:http\://arxiv.org/pdf/2403.11497v2},
  groups      = {ForAug},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  priority    = {prio2},
  year        = {2024},
}

@Article{Aghagolzadeh2025,
  author      = {Aghagolzadeh, Hossein and Ezoji, Mehdi},
  title       = {Contrastive Forward-Forward: A Training Algorithm of Vision Transformer},
  date        = {2025-02-01},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2502.00571},
  eprint      = {2502.00571},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  groups      = {Reading Group Potential},
  file        = {:Aghagolzadeh2025 - Contrastive Forward Forward_ a Training Algorithm of Vision Transformer.pdf:PDF:http\://arxiv.org/pdf/2502.00571v1},
}

@Article{Qin2024,
  author      = {Qin, Tian and Deng, Zhiwei and Alvarez-Melis, David},
  date        = {2024-06-15},
  title       = {A Label is Worth a Thousand Images in Dataset Distillation},
  doi         = {10.48550/ARXIV.2406.10485},
  eprint      = {2406.10485},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  abstract    = {Data $\textit{quality}$ is a crucial factor in the performance of machine learning models, a principle that dataset distillation methods exploit by compressing training datasets into much smaller counterparts that maintain similar downstream performance. Understanding how and why data distillation methods work is vital not only for improving these methods but also for revealing fundamental characteristics of ``good'' training data. However, a major challenge in achieving this goal is the observation that distillation approaches, which rely on sophisticated but mostly disparate methods to generate synthetic data, have little in common with each other. In this work, we highlight a largely overlooked aspect common to most of these methods: the use of soft (probabilistic) labels. Through a series of ablation experiments, we study the role of soft labels in depth. Our results reveal that the main factor explaining the performance of state-of-the-art distillation methods is not the specific techniques used to generate synthetic data but rather the use of soft labels. Furthermore, we demonstrate that not all soft labels are created equal; they must contain $\textit{structured information}$ to be beneficial. We also provide empirical scaling laws that characterize the effectiveness of soft labels as a function of images-per-class in the distilled dataset and establish an empirical Pareto frontier for data-efficient learning. Combined, our findings challenge conventional wisdom in dataset distillation, underscore the importance of soft labels in learning, and suggest new directions for improving distillation methods. Code for all experiments is available at https://github.com/sunnytqin/no-distillation.},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Qin2024 - A Label Is Worth a Thousand Images in Dataset Distillation.pdf:PDF:http\://arxiv.org/pdf/2406.10485v2},
  groups      = {Reading Group Potential},
  keywords    = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Kamboj2024,
  author      = {Kamboj, Abhi and Do, Minh},
  date        = {2024-03-17},
  title       = {A Survey of {IMU} Based Cross-Modal Transfer Learning in Human Activity Recognition},
  doi         = {10.48550/ARXIV.2403.15444},
  eprint      = {2403.15444},
  eprintclass = {eess.SP},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Kamboj2024 - A Survey of IMU Based Cross Modal Transfer Learning in Human Activity Recognition.pdf:PDF:http\://arxiv.org/pdf/2403.15444v1},
  keywords    = {Signal Processing (eess.SP), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Image and Video Processing (eess.IV), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2024},
}

@InProceedings{Xing2018,
  author    = {Xing, Tianwei and Sandha, Sandeep Singh and Balaji, Bharathan and Chakraborty, Supriyo and Srivastava, Mani},
  booktitle = {Proceedings of the 1st International Workshop on Edge Systems, Analytics and Networking},
  title     = {Enabling Edge Devices that Learn from Each Other: Cross Modal Training for Activity Recognition},
  doi       = {10.1145/3213344.3213351},
  isbn      = {9781450358378},
  venue     = {Munich, Germany},
  pages     = {37--42},
  publisher = {Association for Computing Machinery},
  series    = {EdgeSys'18},
  url       = {https://doi.org/10.1145/3213344.3213351},
  address   = {New York, NY, USA},
  file      = {:Xing2018 - Enabling Edge Devices That Learn from Each Other_ Cross Modal Training for Activity Recognition.pdf:PDF},
  keywords  = {activity recognition, cross modality, edge devices, shared latent representation, transfer learning},
  numpages  = {6},
  priority  = {prio2},
  year      = {2018},
}

@InProceedings{Georgakis2022,
  author    = {Georgakis, Georgios and Schmeckpeper, Karl and Wanchoo, Karan and Dan, Soham and Miltsakaki, Eleni and Roth, Dan and Daniilidis, Kostas},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title     = {Cross-Modal Map Learning for Vision and Language Navigation},
  pages     = {15460--15470},
  file      = {:Georgakis2022 - Cross Modal Map Learning for Vision and Language Navigation.pdf:PDF},
  month     = jun,
  priority  = {prio2},
  year      = {2022},
}

@Article{Ma2024a,
  author      = {Ma, Wenxuan and Li, Shuang and Cai, Lincan and Kang, Jingxuan},
  title       = {Learning Modality Knowledge Alignment for Cross-Modality Transfer},
  date        = {2024-06-27},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2406.18864},
  eprint      = {2406.18864},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  priority    = {prio2},
  file        = {:Ma2024a - Learning Modality Knowledge Alignment for Cross Modality Transfer.pdf:PDF:http\://arxiv.org/pdf/2406.18864v1},
}

@InProceedings{Xue2023,
  author    = {Xue, Zihui and Gao, Zhengqi and Ren, Sucheng and Zhao, Hang},
  booktitle = {The Eleventh International Conference on Learning Representations},
  title     = {The Modality Focusing Hypothesis: Towards Understanding Crossmodal Knowledge Distillation},
  url       = {https://openreview.net/forum?id=w0QXrZ3N-s},
  file      = {:Xue2023 - The Modality Focusing Hypothesis_ Towards Understanding Crossmodal Knowledge Distillation.pdf:PDF},
  priority  = {prio1},
  year      = {2023},
}

@Article{Mansourian2025,
  author      = {Mansourian, Amir M. and Ahmadi, Rozhan and Ghafouri, Masoud and Babaei, Amir Mohammad and Golezani, Elaheh Badali and Ghamchi, Zeynab Yasamani and Ramezanian, Vida and Taherian, Alireza and Dinashi, Kimia and Miri, Amirali and Kasaei, Shohreh},
  title       = {A Comprehensive Survey on Knowledge Distillation},
  date        = {2025-03-15},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2503.12067},
  eprint      = {2503.12067},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  priority    = {prio2},
  file        = {:Mansourian2025 - A Comprehensive Survey on Knowledge Distillation.pdf:PDF:http\://arxiv.org/pdf/2503.12067v1},
}

@InProceedings{Peng2019,
  author    = {Peng, Baoyun and Jin, Xiao and Liu, Jiaheng and Li, Dongsheng and Wu, Yichao and Liu, Yu and Zhou, Shunfeng and Zhang, Zhaoning},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title     = {Correlation Congruence for Knowledge Distillation},
  file      = {:Peng2019 - Correlation Congruence for Knowledge Distillation.pdf:PDF},
  month     = oct,
  priority  = {prio1},
  year      = {2019},
}

@InProceedings{Tung2019,
  author    = {Tung, Frederick and Mori, Greg},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  title     = {Similarity-Preserving Knowledge Distillation},
  file      = {:Tung2019 - Similarity Preserving Knowledge Distillation.pdf:PDF},
  month     = oct,
  priority  = {prio1},
  year      = {2019},
}

@Article{Zhao2024,
  author      = {Zhao, Hongbo and Ni, Bolin and Wang, Haochen and Fan, Junsong and Zhu, Fei and Wang, Yuxi and Chen, Yuntao and Meng, Gaofeng and Zhang, Zhaoxiang},
  title       = {Continual Forgetting for Pre-trained Vision Models},
  date        = {2024-03-18},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2403.11530},
  eprint      = {2403.11530},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  groups      = {Reading Group Potential},
  file        = {:Zhao2024 - Continual Forgetting for Pre Trained Vision Models.pdf:PDF:http\://arxiv.org/pdf/2403.11530v2},
}

@Article{Shani2025,
  author      = {Shani, Chen and Jurafsky, Dan and LeCun, Yann and Shwartz-Ziv, Ravid},
  date        = {2025-05-21},
  title       = {From Tokens to Thoughts: How {LLMs} and Humans Trade Compression for Meaning},
  doi         = {10.48550/ARXIV.2505.17117},
  eprint      = {2505.17117},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Shani2025 - From Tokens to Thoughts_ How LLMs and Humans Trade Compression for Meaning.pdf:PDF:http\://arxiv.org/pdf/2505.17117v3},
  groups      = {Reading Group Potential},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Information Theory (cs.IT), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Huh2024,
  author      = {Huh, Minyoung and Cheung, Brian and Wang, Tongzhou and Isola, Phillip},
  date        = {2024-05-13},
  title       = {The {Platonic} Representation Hypothesis},
  doi         = {10.48550/ARXIV.2405.07987},
  eprint      = {2405.07987},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Huh2024 - The Platonic Representation Hypothesis.pdf:PDF:http\://arxiv.org/pdf/2405.07987v5},
  groups      = {Reading Group Potential},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Dohmatob2024,
  author       = {Dohmatob, Elvis and Feng, Yunzhen and Yang, Pu and Charton, Francois and Kempe, Julia},
  date         = {2024-02-10},
  journaltitle = {ICML 2024},
  title        = {A Tale of Tails: Model Collapse as a Change of Scaling Laws},
  doi          = {10.48550/ARXIV.2402.07043},
  eprint       = {2402.07043},
  eprintclass  = {cs.LG},
  eprinttype   = {arXiv},
  abstract     = {As AI model size grows, neural scaling laws have become a crucial tool to predict the improvements of large models when increasing capacity and the size of original (human or natural) training data. Yet, the widespread use of popular models means that the ecosystem of online data and text will co-evolve to progressively contain increased amounts of synthesized data. In this paper we ask: How will the scaling laws change in the inevitable regime where synthetic data makes its way into the training corpus? Will future models, still improve, or be doomed to degenerate up to total (model) collapse? We develop a theoretical framework of model collapse through the lens of scaling laws. We discover a wide range of decay phenomena, analyzing loss of scaling, shifted scaling with number of generations, the ``un-learning'' of skills, and grokking when mixing human and synthesized data. Our theory is validated by large-scale experiments with a transformer on an arithmetic task and text generation using the large language model Llama2.},
  copyright    = {arXiv.org perpetual, non-exclusive license},
  file         = {:Dohmatob2024 - A Tale of Tails_ Model Collapse As a Change of Scaling Laws.pdf:PDF:http\://arxiv.org/pdf/2402.07043v2},
  groups       = {Reading Group Potential},
  keywords     = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), FOS: Computer and information sciences},
  publisher    = {arXiv},
  year         = {2024},
}

@Article{Morawiecki2022,
  author      = {Morawiecki, Paweł and Krutsylo, Andrii and Wołczyk, Maciej and Śmieja, Marek},
  date        = {2022-06-28},
  title       = {{Hebbian} Continual Representation Learning},
  doi         = {10.48550/ARXIV.2207.04874},
  eprint      = {2207.04874},
  eprintclass = {cs.NE},
  eprinttype  = {arXiv},
  abstract    = {Continual Learning aims to bring machine learning into a more realistic scenario, where tasks are learned sequentially and the i.i.d. assumption is not preserved. Although this setting is natural for biological systems, it proves very difficult for machine learning models such as artificial neural networks. To reduce this performance gap, we investigate the question whether biologically inspired Hebbian learning is useful for tackling continual challenges. In particular, we highlight a realistic and often overlooked unsupervised setting, where the learner has to build representations without any supervision. By combining sparse neural networks with Hebbian learning principle, we build a simple yet effective alternative (HebbCL) to typical neural network models trained via the gradient descent. Due to Hebbian learning, the network have easily interpretable weights, which might be essential in critical application such as security or healthcare. We demonstrate the efficacy of HebbCL in an unsupervised learning setting applied to MNIST and Omniglot datasets. We also adapt the algorithm to the supervised scenario and obtain promising results in the class-incremental learning.},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Morawiecki2022 - Hebbian Continual Representation Learning.pdf:PDF:http\://arxiv.org/pdf/2207.04874v1},
  groups      = {Reading Group Potential},
  keywords    = {Neural and Evolutionary Computing (cs.NE), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2022},
}

@Article{Mason2025,
  author      = {Mason, Sebastian Ray and Gjølbye, Anders and Højbjerg, Phillip Chavarria and Tětková, Lenka and Hansen, Lars Kai},
  title       = {Large Vision Models Can Solve Mental Rotation Problems},
  date        = {2025-09-18},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2509.15271},
  eprint      = {2509.15271},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  abstract    = {Mental rotation is a key test of spatial reasoning in humans and has been central to understanding how perception supports cognition. Despite the success of modern vision transformers, it is still unclear how well these models develop similar abilities. In this work, we present a systematic evaluation of ViT, CLIP, DINOv2, and DINOv3 across a range of mental-rotation tasks, from simple block structures similar to those used by Shepard and Metzler to study human cognition, to more complex block figures, three types of text, and photo-realistic objects. By probing model representations layer by layer, we examine where and how these networks succeed. We find that i) self-supervised ViTs capture geometric structure better than supervised ViTs; ii) intermediate layers perform better than final layers; iii) task difficulty increases with rotation complexity and occlusion, mirroring human reaction times and suggesting similar constraints in embedding space representations.},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  priority    = {prio2},
  file        = {:Mason2025 - Large Vision Models Can Solve Mental Rotation Problems.pdf:PDF:http\://arxiv.org/pdf/2509.15271v1},
}

@Article{Helbling2025,
  author      = {Helbling, Alec and Meral, Tuna Han Salih and Hoover, Ben and Yanardag, Pinar and Chau, Duen Horng},
  date        = {2025-02-06},
  title       = {{ConceptAttention}: Diffusion Transformers Learn Highly Interpretable Features},
  doi         = {10.48550/ARXIV.2502.04320},
  eprint      = {2502.04320},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  abstract    = {Do the rich representations of multi-modal diffusion transformers (DiTs) exhibit unique properties that enhance their interpretability? We introduce ConceptAttention, a novel method that leverages the expressive power of DiT attention layers to generate high-quality saliency maps that precisely locate textual concepts within images. Without requiring additional training, ConceptAttention repurposes the parameters of DiT attention layers to produce highly contextualized concept embeddings, contributing the major discovery that performing linear projections in the output space of DiT attention layers yields significantly sharper saliency maps compared to commonly used cross-attention maps. ConceptAttention even achieves state-of-the-art performance on zero-shot image segmentation benchmarks, outperforming 15 other zero-shot interpretability methods on the ImageNet-Segmentation dataset. ConceptAttention works for popular image models and even seamlessly generalizes to video generation. Our work contributes the first evidence that the representations of multi-modal DiTs are highly transferable to vision tasks like segmentation.},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Helbling2025 - ConceptAttention_ Diffusion Transformers Learn Highly Interpretable Features.pdf:PDF:http\://arxiv.org/pdf/2502.04320v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Simeoni2025,
  author      = {Siméoni, Oriane and Vo, Huy V. and Seitzer, Maximilian and Baldassarre, Federico and Oquab, Maxime and Jose, Cijo and Khalidov, Vasil and Szafraniec, Marc and Yi, Seungeun and Ramamonjisoa, Michaël and Massa, Francisco and Haziza, Daniel and Wehrstedt, Luca and Wang, Jianyuan and Darcet, Timothée and Moutakanni, Théo and Sentana, Leonel and Roberts, Claire and Vedaldi, Andrea and Tolan, Jamie and Brandt, John and Couprie, Camille and Mairal, Julien and Jégou, Hervé and Labatut, Patrick and Bojanowski, Piotr},
  date        = {2025-08-13},
  title       = {{DINOv3}},
  doi         = {10.48550/ARXIV.2508.10104},
  eprint      = {2508.10104},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  abstract    = {Self-supervised learning holds the promise of eliminating the need for manual data annotation, enabling models to scale effortlessly to massive datasets and larger architectures. By not being tailored to specific tasks or domains, this training paradigm has the potential to learn visual representations from diverse sources, ranging from natural to aerial images -- using a single algorithm. This technical report introduces DINOv3, a major milestone toward realizing this vision by leveraging simple yet effective strategies. First, we leverage the benefit of scaling both dataset and model size by careful data preparation, design, and optimization. Second, we introduce a new method called Gram anchoring, which effectively addresses the known yet unsolved issue of dense feature maps degrading during long training schedules. Finally, we apply post-hoc strategies that further enhance our models' flexibility with respect to resolution, model size, and alignment with text. As a result, we present a versatile vision foundation model that outperforms the specialized state of the art across a broad range of settings, without fine-tuning. DINOv3 produces high-quality dense features that achieve outstanding performance on various vision tasks, significantly surpassing previous self- and weakly-supervised foundation models. We also share the DINOv3 suite of vision models, designed to advance the state of the art on a wide spectrum of tasks and data by providing scalable solutions for diverse resource constraints and deployment scenarios.},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Simeoni2025 - DINOv3.pdf:PDF:http\://arxiv.org/pdf/2508.10104v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Zhou2022a,
  author      = {Zhou, Minghao and Wang, Quanziang and Shu, Jun and Zhao, Qian and Meng, Deyu},
  title       = {Diagnosing Batch Normalization in Class Incremental Learning},
  date        = {2022-02-16},
  year        = {2022},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2202.08025},
  eprint      = {2202.08025},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  abstract    = {Extensive researches have applied deep neural networks (DNNs) in class incremental learning (Class-IL). As building blocks of DNNs, batch normalization (BN) standardizes intermediate feature maps and has been widely validated to improve training stability and convergence. However, we claim that the direct use of standard BN in Class-IL models is harmful to both the representation learning and the classifier training, thus exacerbating catastrophic forgetting. In this paper we investigate the influence of BN on Class-IL models by illustrating such BN dilemma. We further propose BN Tricks to address the issue by training a better feature extractor while eliminating classification bias. Without inviting extra hyperparameters, we apply BN Tricks to three baseline rehearsal-based methods, ER, DER++ and iCaRL. Through comprehensive experiments conducted on benchmark datasets of Seq-CIFAR-10, Seq-CIFAR-100 and Seq-Tiny-ImageNet, we show that BN Tricks can bring significant performance gains to all adopted baselines, revealing its potential generality along this line of research.},
  keywords    = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  groups      = {Reading Group Potential},
  file        = {:Zhou2022a - Diagnosing Batch Normalization in Class Incremental Learning.pdf:PDF:http\://arxiv.org/pdf/2202.08025v1},
}

@Article{Kosowski2025,
  author      = {Kosowski, Adrian and Uznański, Przemysław and Chorowski, Jan and Stamirowska, Zuzanna and Bartoszkiewicz, Michał},
  title       = {The Dragon Hatchling: The Missing Link between the Transformer and Models of the Brain},
  date        = {2025-09-30},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2509.26507},
  eprint      = {2509.26507},
  eprinttype  = {arXiv},
  eprintclass = {cs.NE},
  keywords    = {Neural and Evolutionary Computing (cs.NE), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  priority    = {prio3},
  file        = {:Kosowski2025 - The Dragon Hatchling_ the Missing Link between the Transformer and Models of the Brain.pdf:PDF:http\://arxiv.org/pdf/2509.26507v1},
}

@Article{Ruiz2020,
  author      = {Ruiz, Alejandro Hernandez and Vilalta, Armand and Moreno-Noguer, Francesc},
  title       = {Neural Cellular Automata Manifold},
  date        = {2020-06-22},
  year        = {2020},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2006.12155},
  eprint      = {2006.12155},
  eprinttype  = {arXiv},
  eprintclass = {cs.NE},
  keywords    = {Neural and Evolutionary Computing (cs.NE), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  groups      = {Reading Group Potential},
  file        = {:Ruiz2020 - Neural Cellular Automata Manifold.pdf:PDF:http\://arxiv.org/pdf/2006.12155v3},
}

@InProceedings{Rahat2025,
  author    = {Rahat, Fazle and Hossain, M Shifat and Ahmed, Md Rubel and Jha, Sumit Kumar and Ewetz, Rickard},
  booktitle = {Proceedings of the Winter Conference on Applications of Computer Vision (WACV)},
  title     = {Data Augmentation for Image Classification using Generative {AI}},
  pages     = {4173--4182},
  file      = {:Rahat2025 - Data Augmentation for Image Classification Using Generative AI.pdf:PDF},
  month     = feb,
  priority  = {prio1},
  year      = {2025},
}

@Article{Abdullaev2025,
  author      = {Abdullaev, Laziz U. and Tkachenko, Maksim and Nguyen, Tan M.},
  title       = {Revisiting Transformers with Insights from Image Filtering},
  date        = {2025-06-12},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2506.10371},
  eprint      = {2506.10371},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  priority    = {prio3},
  file        = {:Abdullaev2025 - Revisiting Transformers with Insights from Image Filtering.pdf:PDF:http\://arxiv.org/pdf/2506.10371v1},
}

@Article{Jiang2025,
  author      = {Jiang, Nick and Dravid, Amil and Efros, Alexei and Gandelsman, Yossi},
  title       = {Vision Transformers Don't Need Trained Registers},
  date        = {2025-06-09},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2506.08010},
  eprint      = {2506.08010},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  priority    = {prio2},
  file        = {:Jiang2025 - Vision Transformers Don't Need Trained Registers.pdf:PDF:http\://arxiv.org/pdf/2506.08010v4},
}

@Article{Dong2025,
  author      = {Dong, Yihe and Noci, Lorenzo and Khodak, Mikhail and Li, Mufan},
  title       = {Is Random Attention Sufficient for Sequence Modeling? Disentangling Trainable Components in the Transformer},
  date        = {2025-06-01},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2506.01115},
  eprint      = {2506.01115},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  keywords    = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  groups      = {Reading Group Potential},
  priority    = {prio3},
  file        = {:Dong2025 - Is Random Attention Sufficient for Sequence Modeling_ Disentangling Trainable Components in the Transformer.pdf:PDF:http\://arxiv.org/pdf/2506.01115v3},
}

@Article{Nadeem2025,
  author      = {Nadeem, Numair and Anwar, Saeed and Asad, Muhammad Hamza and Bais, Abdul},
  date        = {2025-06-16},
  title       = {{HVL}: Semi-Supervised Segmentation leveraging Hierarchical Vision-Language Synergy with Dynamic Text-Spatial Query Alignment},
  doi         = {10.48550/ARXIV.2506.13925},
  eprint      = {2506.13925},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Nadeem2025 - HVL_ Semi Supervised Segmentation Leveraging Hierarchical Vision Language Synergy with Dynamic Text Spatial Query Alignment.pdf:PDF:http\://arxiv.org/pdf/2506.13925v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Li2025a,
  author      = {Li, Tianqin and Wen, Ziqi and Song, Leiran and Liu, Jun and Jing, Zhi and Lee, Tai Sing},
  date        = {2025-05-31},
  title       = {From Local Cues to Global Percepts: Emergent {Gestalt} Organization in Self-Supervised Vision Models},
  doi         = {10.48550/ARXIV.2506.00718},
  eprint      = {2506.00718},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Li2025a - From Local Cues to Global Percepts_ Emergent Gestalt Organization in Self Supervised Vision Models.pdf:PDF:http\://arxiv.org/pdf/2506.00718v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@InProceedings{Peng2025,
  author    = {Peng, Zelin and Huang, Yu and Xu, Zhengqin and Tang, Feilong and Hu, Ming and Yang, Xiaokang and Shen, Wei},
  booktitle = {2025 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title     = {Star with Bilinear Mapping},
  doi       = {10.1109/CVPR52734.2025.02355},
  pages     = {25292--25302},
  file      = {:Peng2025 - Star with Bilinear Mapping.pdf:PDF},
  keywords  = {Computer vision;Computational modeling;Semantic segmentation;Stars;Computer architecture;Transformers;Complexity theory;Computational efficiency;Context modeling;Image classification},
  priority  = {prio3},
  year      = {2025},
}

@Article{Kuzucu2025,
  author      = {Kuzucu, Selim and Naeem, Muhammad Ferjad and Kukleva, Anna and Tombari, Federico and Schiele, Bernt},
  date        = {2025-07-01},
  title       = {Language-Unlocked {ViT} ({LUViT}): Empowering Self-Supervised Vision Transformers with {LLMs}},
  doi         = {10.48550/ARXIV.2507.00754},
  eprint      = {2507.00754},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  file        = {:Kuzucu2025 - Language Unlocked ViT (LUViT)_ Empowering Self Supervised Vision Transformers with LLMs.pdf:PDF:http\://arxiv.org/pdf/2507.00754v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority    = {prio2},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Hanna2025,
  author      = {Hanna, Joelle and Borth, Damian},
  title       = {Know Your Attention Maps: Class-specific Token Masking for Weakly Supervised Semantic Segmentation},
  date        = {2025-07-09},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2507.06848},
  eprint      = {2507.06848},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  priority    = {prio3},
  file        = {:Hanna2025 - Know Your Attention Maps_ Class Specific Token Masking for Weakly Supervised Semantic Segmentation.pdf:PDF:http\://arxiv.org/pdf/2507.06848v1},
}

@Article{Bai2025,
  author      = {Bai, Xiaoyan and Pres, Itamar and Deng, Yuntian and Tan, Chenhao and Shieber, Stuart and Viégas, Fernanda and Wattenberg, Martin and Lee, Andrew},
  title       = {Why Can't Transformers Learn Multiplication? Reverse-Engineering Reveals Long-Range Dependency Pitfalls},
  date        = {2025-09-30},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2510.00184},
  eprint      = {2510.00184},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  priority    = {prio3},
  file        = {:Bai2025 - Why Can't Transformers Learn Multiplication_ Reverse Engineering Reveals Long Range Dependency Pitfalls.pdf:PDF:http\://arxiv.org/pdf/2510.00184v1},
}

@Article{Yu2025,
  author      = {Yu, Ruoxi and Jiang, Haotian and Cheng, Jingpu and Yu, Penghao and Li, Qianxiao and Li, Zhong},
  title       = {Allocation of Parameters in Transformers},
  date        = {2025-10-04},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2510.03784},
  eprint      = {2510.03784},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  keywords    = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  priority    = {prio2},
  file        = {:Yu2025 - Allocation of Parameters in Transformers.pdf:PDF:http\://arxiv.org/pdf/2510.03784v1},
}

@Article{Jain2025,
  author       = {Jain, Siddharth and Karthik, Shyamgopal and Gandhi, Vineet},
  date         = {2025-10-25},
  journaltitle = {Transactions on Machine Learning Research},
  title        = {Simplifying Knowledge Transfer in Pretrained Models},
  doi          = {10.48550/ARXIV.2510.22208},
  eprint       = {2510.22208},
  eprintclass  = {cs.LG},
  eprinttype   = {arXiv},
  copyright    = {Creative Commons Attribution 4.0 International},
  file         = {:Jain2025 - Simplifying Knowledge Transfer in Pretrained Models.pdf:PDF:http\://arxiv.org/pdf/2510.22208v1},
  keywords     = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  priority     = {prio2},
  publisher    = {arXiv},
  year         = {2025},
}

@Article{Liu2025a,
  author      = {Liu, Yiming and Zhang, Yuhui and Ghosh, Dhruba and Schmidt, Ludwig and Yeung-Levy, Serena},
  date        = {2025-10-13},
  title       = {Data or Language Supervision: What Makes {CLIP} Better than {DINO}?},
  doi         = {10.48550/ARXIV.2510.11835},
  eprint      = {2510.11835},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Liu2025a - Data or Language Supervision_ What Makes CLIP Better Than DINO_.pdf:PDF:http\://arxiv.org/pdf/2510.11835v1},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), Multimedia (cs.MM), FOS: Computer and information sciences},
  priority    = {prio1},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Qiu2025,
  author      = {Qiu, Haiquan and Yao, Quanming},
  date        = {2025-10-05},
  title       = {Why Low-Precision Transformer Training Fails: An Analysis on {Flash Attention}},
  doi         = {10.48550/ARXIV.2510.04212},
  eprint      = {2510.04212},
  eprintclass = {cs.LG},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Qiu2025 - Why Low Precision Transformer Training Fails_ an Analysis on Flash Attention.pdf:PDF:http\://arxiv.org/pdf/2510.04212v2},
  keywords    = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  priority    = {prio3},
  publisher   = {arXiv},
  year        = {2025},
}

@TechReport{Radford2018,
  author      = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
  institution = {OpenAI},
  title       = {Improving Language Understanding by Generative Pre-Training},
  file        = {:Radford2018 - Improving Language Understanding by Generative Pre Training.pdf:PDF},
  year        = {2018},
}

@Article{Touvron2023,
  author      = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timothée and Rozière, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume},
  date        = {2023-02-27},
  title       = {{LLaMA}: Open and Efficient Foundation Language Models},
  doi         = {10.48550/ARXIV.2302.13971},
  eprint      = {2302.13971},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:http\://arxiv.org/pdf/2302.13971v1:PDF},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2023},
}

@Article{Grattafiori2024,
  author      = {Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and Yang, Amy and Fan, Angela and Goyal, Anirudh and Hartshorn, Anthony and Yang, Aobo and Mitra, Archi and Sravankumar, Archie and Korenev, Artem and Hinsvark, Arthur and Rao, Arun and Zhang, Aston and Rodriguez, Aurelien and Gregerson, Austen and Spataru, Ava and others},
  date        = {2024-07-31},
  title       = {The {Llama} 3 Herd of Models},
  doi         = {10.48550/ARXIV.2407.21783},
  eprint      = {2407.21783},
  eprintclass = {cs.AI},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Grattafiori2024 - The Llama 3 Herd of Models.pdf:PDF:http\://arxiv.org/pdf/2407.21783v3},
  keywords    = {Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2024},
}

@InProceedings{Jia2021,
  author      = {Jia, Chao and Yang, Yinfei and Xia, Ye and Chen, Yi-Ting and Parekh, Zarana and Pham, Hieu and Le, Quoc V. and Sung, Yunhsuan and Li, Zhen and Duerig, Tom},
  booktitle   = {International Conference on Machine Learning},
  date        = {2021-02-11},
  title       = {Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision},
  doi         = {10.48550/ARXIV.2102.05918},
  eprint      = {2102.05918},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Jia2021 - Scaling up Visual and Vision Language Representation Learning with Noisy Text Supervision.pdf:PDF:http\://arxiv.org/pdf/2102.05918v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2021},
}

@Article{Li2022c,
  author      = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
  date        = {2022-01-28},
  title       = {{BLIP}: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
  doi         = {10.48550/ARXIV.2201.12086},
  eprint      = {2201.12086},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Li2022c - BLIP_ Bootstrapping Language Image Pre Training for Unified Vision Language Understanding and Generation.pdf:PDF:http\://arxiv.org/pdf/2201.12086v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2022},
}

@Article{Thapa2024,
  author      = {Thapa, Rahul and Chen, Kezhen and Covert, Ian and Chalamala, Rahul and Athiwaratkun, Ben and Song, Shuaiwen Leon and Zou, James},
  title       = {Dragonfly: Multi-Resolution Zoom-In Encoding Enhances Vision-Language Models},
  date        = {2024-06-03},
  year        = {2024},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2406.00977},
  eprint      = {2406.00977},
  eprinttype  = {arXiv},
  eprintclass = {cs.CV},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Thapa2024 - Dragonfly_ Multi Resolution Zoom in Encoding Enhances Vision Language Models.pdf:PDF:http\://arxiv.org/pdf/2406.00977v2},
}

@Article{Beyer2024,
  author      = {Beyer, Lucas and Steiner, Andreas and Pinto, André Susano and Kolesnikov, Alexander and Wang, Xiao and Salz, Daniel and Neumann, Maxim and Alabdulmohsin, Ibrahim and Tschannen, Michael and Bugliarello, Emanuele and Unterthiner, Thomas and Keysers, Daniel and Koppula, Skanda and Liu, Fangyu and Grycner, Adam and Gritsenko, Alexey and Houlsby, Neil and Kumar, Manoj and Rong, Keran and Eisenschlos, Julian and Kabra, Rishabh and Bauer, Matthias and Bošnjak, Matko and Chen, Xi and Minderer, Matthias and Voigtlaender, Paul and Bica, Ioana and Balazevic, Ivana and Puigcerver, Joan and Papalampidi, Pinelopi and Henaff, Olivier and Xiong, Xi and Soricut, Radu and Harmsen, Jeremiah and Zhai, Xiaohua},
  date        = {2024-07-10},
  title       = {{PaliGemma}: A versatile {3B VLM} for transfer},
  doi         = {10.48550/ARXIV.2407.07726},
  eprint      = {2407.07726},
  eprintclass = {cs.CV},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Beyer2024 - PaliGemma_ a Versatile 3B VLM for Transfer.pdf:PDF:http\://arxiv.org/pdf/2407.07726v2},
  keywords    = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2024},
}

@Article{Yang2025,
  author      = {Yang, An and Li, Anfeng and Yang, Baosong and Zhang, Beichen and Hui, Binyuan and Zheng, Bo and Yu, Bowen and Gao, Chang and Huang, Chengen and Lv, Chenxu and Zheng, Chujie and Liu, Dayiheng and Zhou, Fan and Huang, Fei and Hu, Feng and Ge, Hao and Wei, Haoran and Lin, Huan and Tang, Jialong and Yang, Jian and Tu, Jianhong and Zhang, Jianwei and Yang, Jianxin and Yang, Jiaxi and Zhou, Jing and Zhou, Jingren and Lin, Junyang and Dang, Kai and Bao, Keqin and Yang, Kexin and Yu, Le and Deng, Lianghao and Li, Mei and Xue, Mingfeng and Li, Mingze and Zhang, Pei and Wang, Peng and Zhu, Qin and Men, Rui and Gao, Ruize and Liu, Shixuan and Luo, Shuang and Li, Tianhao and Tang, Tianyi and Yin, Wenbiao and Ren, Xingzhang and Wang, Xinyu and Zhang, Xinyu and Ren, Xuancheng and Fan, Yang and Su, Yang and Zhang, Yichang and Zhang, Yinger and Wan, Yu and Liu, Yuqiong and Wang, Zekun and Cui, Zeyu and Zhang, Zhenru and Zhou, Zhipeng and Qiu, Zihan},
  title       = {Qwen3 Technical Report},
  date        = {2025-05-14},
  year        = {2025},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2505.09388},
  eprint      = {2505.09388},
  eprinttype  = {arXiv},
  eprintclass = {cs.CL},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Yang2025 - Qwen3 Technical Report.pdf:PDF},
}

@Article{Zhang2025a,
  author      = {Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},
  date        = {2025-06-05},
  title       = {{Qwen3 Embedding}: Advancing Text Embedding and Reranking Through Foundation Models},
  doi         = {10.48550/ARXIV.2506.05176},
  eprint      = {2506.05176},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  copyright   = {arXiv.org perpetual, non-exclusive license},
  file        = {:Zhang2025a - Qwen3 Embedding_ Advancing Text Embedding and Reranking through Foundation Models.pdf:PDF:http\://arxiv.org/pdf/2506.05176v3},
  keywords    = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2025},
}

@Article{Lee2024b,
  author      = {Lee, Chankyu and Roy, Rajarshi and Xu, Mengyao and Raiman, Jonathan and Shoeybi, Mohammad and Catanzaro, Bryan and Ping, Wei},
  date        = {2024-05-27},
  title       = {{NV-Embed}: Improved Techniques for Training {LLMs} as Generalist Embedding Models},
  doi         = {10.48550/ARXIV.2405.17428},
  eprint      = {2405.17428},
  eprintclass = {cs.CL},
  eprinttype  = {arXiv},
  copyright   = {Creative Commons Attribution 4.0 International},
  file        = {:Lee2024b - NV Embed_ Improved Techniques for Training LLMs As Generalist Embedding Models.pdf:PDF:http\://arxiv.org/pdf/2405.17428v3},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Information Retrieval (cs.IR), Machine Learning (cs.LG), FOS: Computer and information sciences},
  publisher   = {arXiv},
  year        = {2024},
}

@InProceedings{Cai2021,
  author    = {Cai, Xingyu and Huang, Jiaji and Bian, Yuchen and Church, Kenneth},
  title     = {Isotropy in the Contextual Embedding Space: Clusters and Manifolds},
  booktitle = {International Conference on Learning Representations},
  year      = {2021},
  url       = {https://openreview.net/forum?id=xYGNO86OWDH},
  file      = {:Cai2021 - Isotropy in the Contextual Embedding Space_ Clusters and Manifolds.pdf:PDF},
}

@Article{Razzhigaev2023,
  author      = {Razzhigaev, Anton and Mikhalchuk, Matvey and Goncharova, Elizaveta and Oseledets, Ivan and Dimitrov, Denis and Kuznetsov, Andrey},
  title       = {The Shape of Learning: Anisotropy and Intrinsic Dimensions in Transformer-Based Models},
  date        = {2023-11-10},
  year        = {2023},
  publisher   = {arXiv},
  doi         = {10.48550/ARXIV.2311.05928},
  eprint      = {2311.05928},
  eprinttype  = {arXiv},
  eprintclass = {cs.CL},
  keywords    = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Information Theory (cs.IT), Machine Learning (cs.LG), General Topology (math.GN), FOS: Computer and information sciences, FOS: Mathematics},
  copyright   = {Creative Commons Zero v1.0 Universal},
  abstract    = {In this study, we present an investigation into the anisotropy dynamics and intrinsic dimension of embeddings in transformer architectures, focusing on the dichotomy between encoders and decoders. Our findings reveal that the anisotropy profile in transformer decoders exhibits a distinct bell-shaped curve, with the highest anisotropy concentrations in the middle layers. This pattern diverges from the more uniformly distributed anisotropy observed in encoders. In addition, we found that the intrinsic dimension of embeddings increases in the initial phases of training, indicating an expansion into higher-dimensional space. Which is then followed by a compression phase towards the end of training with dimensionality decrease, suggesting a refinement into more compact representations. Our results provide fresh insights to the understanding of encoders and decoders embedding properties.},
  file        = {:Razzhigaev2023 - The Shape of Learning_ Anisotropy and Intrinsic Dimensions in Transformer Based Models.pdf:PDF:http\://arxiv.org/pdf/2311.05928v2},
}

@Comment{jabref-meta: databaseType:biblatex;}

@Comment{jabref-meta: fileDirectoryLatex-tobias-port-4114:/home/nauen/cloud/JobDFKI;}

@Comment{jabref-meta: fileDirectoryLatex-tobias-tobias-MS-7C37:/data/cloud/JobDFKI;}

@Comment{jabref-meta: grouping:
0 AllEntriesGroup:;
1 StaticGroup:Datasets\;0\;0\;0xff00ffff\;\;\;;
1 StaticGroup:Coreset for FL\;0\;1\;0xe5e500ff\;\;\;;
1 StaticGroup:Dataset Distillation Survey\;0\;0\;0x003333ff\;\;Citations for our distillation survey paper\;;
2 StaticGroup:Condensed Dataset\;0\;0\;0x8a8a8aff\;\;\;;
2 StaticGroup:Importance Sampling\;0\;1\;0x8a8a8aff\;\;\;;
2 StaticGroup:Pruning\;0\;1\;0x8a8a8aff\;\;\;;
2 StaticGroup:Noisy Labels\;0\;1\;0x8a8a8aff\;\;\;;
2 StaticGroup:Surveys\;0\;1\;0x8a8a8aff\;\;\;;
1 StaticGroup:WTF Benchmark\;0\;1\;0xb31a1aff\;\;\;;
1 StaticGroup:Reading Group Potential\;0\;0\;0x1a3399ff\;\;\;;
1 StaticGroup:ForAug\;0\;1\;0xcc6633ff\;\;\;;
}