@unpublished{SpravilHoubenBehnke2024,
  author      = {Spravil, Julian and Houben, Sebastian and Behnke, Sven},
  title       = {{HyenaPixel}: Global Image Context with Convolutions},
  institution = {Fachbereich Informatik},
  pages       = {13},
  year        = {2024},
  note        = {Preprint},
  abstract    = {In vision tasks, a larger effective receptive field (ERF) is associated with better performance. While attention natively supports global context, convolution requires multiple stacked layers and a hierarchical structure for large context. In this work, we extend Hyena, a convolution-based attention replacement, from causal sequences to the non-causal two-dimensional image space. We scale the Hyena convolution kernels beyond the feature map size up to 191{$\times$}191 to maximize the ERF while maintaining sub-quadratic complexity in the number of pixels. We integrate our two-dimensional Hyena, HyenaPixel, and bidirectional Hyena into the MetaFormer framework. For image categorization, HyenaPixel and bidirectional Hyena achieve a competitive ImageNet-1k top-1 accuracy of 83.0\% and 83.5\%, respectively, while outperforming other large-kernel networks. Combining HyenaPixel with attention further increases accuracy to 83.6\%. We attribute the success of attention to the lack of spatial bias in later stages and support this finding with bidirectional Hyena.},
  language    = {en},
}