Commits (5)
......@@ -2,6 +2,17 @@
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
......@@ -5,9 +5,8 @@ import torch.nn
import numpy as np
import opt_einsum as oe
from jpeg_codec import D_n, Z, S_i, encode, decode
from scipy.misc import imresize
device = torch.device('cuda')
device = torch.device('cpu')
class AppxReLU(torch.nn.modules.Module):
......@@ -77,9 +76,10 @@ for f in range(15):
apx_relu = appx_relu(im_jpeg)
annm_im = decode(annm_relu, device=device)
apx_im = apx_relu.view(-1, 1, 8, 8)
annm_errors[f] += rmse_error(annm_im, true_relu)
appx_errors[f] += rmse_error(apx_relu, true_relu)
appx_errors[f] += rmse_error(apx_im, true_relu)
annm_errors /= args.batches * args.batch_size
appx_errors /= args.batches * args.batch_size
title={Vision and video: models and applications},
author={Winkler, Stefan and Kunt, Murat and van den Branden Lambrecht, Christian J},
booktitle={Vision Models and Applications to Image and Video Processing},
title={The JPEG still picture compression standard},
author={Wallace, Gregory K},
journal={IEEE transactions on consumer electronics},
title={Fast software processing of motion JPEG video},
author={Smith, B},
booktitle={Proceedings of the second ACM international conference on Multimedia},
title={Video Compositing in the DCT domain},
author={Chang, S-F},
booktitle={IEEE Workshop on Visual Signal Processing and Communications, Raleigh, NC, Sep. 1992},
title={Inner-block operations on compressed images},
author={Shen, Bo and Sethi, Ishwar K},
booktitle={Proceedings of the third ACM international conference on Multimedia},
title={A fast approximate algorithm for scaling down digital images in the DCT domain},
author={Natarajan, Balas K and Vasudev, Bhaskaran},
booktitle={Image Processing, 1995. Proceedings., International Conference on},
title={Algorithms for manipulating compressed images},
author={Smith, Brian C and Rowe, Lawrence A},
journal={IEEE Computer Graphics and Applications},
title={Direct feature extraction from compressed images},
author={Shen, Bo and Sethi, Ishwar K},
booktitle={Storage and Retrieval for Still Image and Video Databases IV},
organization={International Society for Optics and Photonics}
title={A new approach to decoding and compositing motion-compensated DCT-based images},
author={Chang, Shih-Fu and Messerschmitt, David G},
title={Block-based manipulations on transform-compressed images and videos},
author={Shen, Bo and Sethi, Ishwar K},
journal={Multimedia Systems},
title={Batch normalization: Accelerating deep network training by reducing internal covariate shift},
author={Ioffe, Sergey and Szegedy, Christian},
journal={arXiv preprint arXiv:1502.03167},
title={Deep residual learning for image recognition},
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
title={Automatic differentiation in PyTorch},
author={Paszke, Adam and Gross, Sam and Chintala, Soumith and Chanan, Gregory and Yang, Edward and DeVito, Zachary and Lin, Zeming and Desmaison, Alban and Antiga, Luca and Lerer, Adam},
title={{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
note={Software available from tensorflow.org},
Mart\'{\i}n~Abadi and
Ashish~Agarwal and
Paul~Barham and
Eugene~Brevdo and
Zhifeng~Chen and
Craig~Citro and
Greg~S.~Corrado and
Andy~Davis and
Jeffrey~Dean and
Matthieu~Devin and
Sanjay~Ghemawat and
Ian~Goodfellow and
Andrew~Harp and
Geoffrey~Irving and
Michael~Isard and
Yangqing Jia and
Rafal~Jozefowicz and
Lukasz~Kaiser and
Manjunath~Kudlur and
Josh~Levenberg and
Dan~Man\'{e} and
Rajat~Monga and
Sherry~Moore and
Derek~Murray and
Chris~Olah and
Mike~Schuster and
Jonathon~Shlens and
Benoit~Steiner and
Ilya~Sutskever and
Kunal~Talwar and
Paul~Tucker and
Vincent~Vanhoucke and
Vijay~Vasudevan and
Fernanda~Vi\'{e}gas and
Oriol~Vinyals and
Pete~Warden and
Martin~Wattenberg and
Martin~Wicke and
Yuan~Yu and
Author = {Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor},
Journal = {arXiv preprint arXiv:1408.5093},
Title = {Caffe: Convolutional Architecture for Fast Feature Embedding},
Year = {2014}
title={cudnn: Efficient primitives for deep learning},
author={Chetlur, Sharan and Woolley, Cliff and Vandermersch, Philippe and Cohen, Jonathan and Tran, John and Catanzaro, Bryan and Shelhamer, Evan},
journal={arXiv preprint arXiv:1410.0759},
title={Compressed video action recognition},
author={Wu, Chao-Yuan and Zaheer, Manzil and Hu, Hexiang and Manmatha, R and Smola, Alexander J and Kr{\"a}henb{\"u}hl, Philipp},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
title={Faster Neural Networks Straight from JPEG},
author={Lionel Gueguen and Alex Sergeev and Ben Kadlec and Rosanne Liu and Jason Yosinski},
booktitle={International Conference on Learning Representations},
title={Image processing on compressed data for large video databases},
author={Arman, Farshid and Hsu, Arding and Chiu, Ming-Yee},
booktitle={Proceedings of the first ACM international conference on Multimedia},
title={Efficient image retrieval in DCT domain by hypothesis testing},
author={He, Daan and Gu, Zhenmei and Cercone, Nick},
booktitle={Image Processing (ICIP), 2009 16th IEEE International Conference on},
title={JPEG image retrieval based on features from DCT domain},
author={Feng, Guocan and Jiang, Jianmin},
booktitle={International Conference on Image and Video Retrieval},
title={Deep feature extraction in the DCT domain},
author={Ghosh, Arthita and Chellappa, Rama},
booktitle={Pattern Recognition (ICPR), 2016 23rd International Conference on},
title={SIFT Feature Extraction Algorithm for Image in DCT Domain},
author={Wu, Zhen and Xu, Zhe and Zhang, Rui Nian and Li, Shao Mei},
booktitle={Applied Mechanics and Materials},
organization={Trans Tech Publ}
title={The MNIST database of handwritten digits},
author={LeCun, Yann},
journal={http://yann.lecun.com/exdb/mnist/}
title={Learning multiple layers of features from tiny images},
author={Krizhevsky, Alex and Hinton, Geoffrey},
title={U-net: Convolutional networks for biomedical image segmentation},
author={Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},
booktitle={International Conference on Medical image computing and computer-assisted intervention},
title={Imagenet classification with deep convolutional neural networks},
author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
booktitle={Advances in neural information processing systems},
title={Deep learning applications and challenges in big data analytics},
author={Najafabadi, Maryam M and Villanustre, Flavio and Khoshgoftaar, Taghi M and Seliya, Naeem and Wald, Randall and Muharemagic, Edin},
journal={Journal of Big Data},
title={Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding},
author={Han, Song and Mao, Huizi and Dally, William J},
journal={arXiv preprint arXiv:1510.00149},
\ No newline at end of file
\ No newline at end of file
%% This is file `eso-pic.sty',
%% generated with the docstrip utility.
%% The original source files were:
%% eso-pic.dtx (with options: `package')
%% This is a generated file.
%% Copyright (C) 1998-2002 by Rolf Niepraschk <niepraschk@ptb.de>
%% This file may be distributed and/or modified under the conditions of
%% the LaTeX Project Public License, either version 1.2 of this license
%% or (at your option) any later version. The latest version of this
%% license is in:
%% http://www.latex-project.org/lppl.txt
%% and version 1.2 or later is part of all distributions of LaTeX version
%% 1999/12/01 or later.
[2002/11/16 v1.1b eso-pic (RN)]
\newcommand{\ESO@HookI}{} \newcommand{\ESO@HookII}{}
\newif\ifESO@dvips\ESO@dvipsfalse \newif\ifESO@grid\ESO@gridfalse
\newcommand*\ESO@subgridstyle{dotted}% ???
\define@key{ESO}{texcoord}[true]{\csname ESO@texcoord#1\endcsname}
\define@key{ESO}{pscoord}[true]{\csname @tempswa#1\endcsname
\define@key{ESO}{dvips}[true]{\csname ESO@dvips#1\endcsname}
\define@key{ESO}{grid}[true]{\csname ESO@grid#1\endcsname
\define@key{ESO}{colorgrid}[true]{\csname ESO@grid#1\endcsname
\@tempdimb=#2\@tempdimb\divide\@tempdima by \@tempdimb%
\multiply\@tempcntb by \ESO@gridDelta\relax%
\@tempdima=\@tempcntb sp\@tempdima=\ESO@labelfactor\@tempdima%
\multiply\@tempcntb by \ESO@gridDelta\relax%
\@tempdima=\@tempcntb sp\@tempdima=\ESO@labelfactor\@tempdima%
%% End of file `eso-pic.sty'.
Average Spatial Accuracy,0.9881829999999998,0.7253580000000001,0.38506300000000004
Average JPEG Accuracy,0.9881859999999998,0.7253489999999999,0.385062
\title{Deep Residual Learning in the JPEG Transform Domain}
\author{Max Ehrlich and Larry Davis\\
{\tt\small maxehr@umiacs.umd.edu} \qquad {\tt\small lsd@umiacs.umd.edu}\\
University of Maryland, College Park, MD, USA.
We introduce a general method of performing Residual Network inference and learning in the JPEG transform domain that allows the network to consume compressed images as input. Our formulation leverages the linearity of the JPEG transform to redefine convolution and batch normalization with a tune-able numerical approximation for ReLu. The result is mathematically equivalent to the spatial domain network up to the ReLu approximation accuracy. A formulation for image classification and a model conversion algorithm for spatial domain networks are given as examples of the method. We show that the sparsity of the JPEG format allows for faster processing of the images with little to no penalty in the network accuracy.
\ No newline at end of file
% ---------------------------------------------------------------
% $Id: iccv.sty,v 1.3 2005/10/24 19:56:15 awf Exp $
% by Paolo.Ienne@di.epfl.ch
% some mods by awf@acm.org
% ---------------------------------------------------------------
% no guarantee is given that the format corresponds perfectly to
% IEEE 8.5" x 11" Proceedings, but most features should be ok.
% ---------------------------------------------------------------
% with LaTeX2e:
% =============
% use as
% \documentclass[times,10pt,twocolumn]{article}
% \usepackage{latex8}
% \usepackage{times}
% ---------------------------------------------------------------
% with LaTeX 2.09:
% ================
% use as
% \documentstyle[times,art10,twocolumn,latex8]{article}
% ---------------------------------------------------------------
% with both versions:
% ===================
% specify \iccvfinalcopy to emit the final camera-ready copy
% specify references as
% \bibliographystyle{ieee}
% \bibliography{...your files...}
% ---------------------------------------------------------------
\typeout{ICCV 8.5 x 11-Inch Proceedings Style `iccv.sty'.}
% ten point helvetica bold required for captions
% eleven point times bold required for second-order headings
% in some sites the name of the fonts may differ,
% change the name here:
\font\iccvtenhv = phvb at 8pt % *** IF THIS FAILS, SEE iccv.sty ***
\font\elvbf = ptmb scaled 1100
% If the above lines give an error message, try to comment them and
% uncomment these:
%\font\iccvtenhv = phvb7t at 8pt
%\font\elvbf = ptmb7t scaled 1100
% set dimensions of columns, gap between columns, and paragraph indent
% memento from size10.clo
% \normalsize{\@setfontsize\normalsize\@xpt\@xiipt}
% \small{\@setfontsize\small\@ixpt{11}}
% \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}}
% \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt}
% \tiny{\@setfontsize\tiny\@vpt\@vipt}
% \large{\@setfontsize\large\@xiipt{14}}
% \Large{\@setfontsize\Large\@xivpt{18}}
% \LARGE{\@setfontsize\LARGE\@xviipt{22}}
% \huge{\@setfontsize\huge\@xxpt{25}}
% \Huge{\@setfontsize\Huge\@xxvpt{30}}
\vskip .375in
{\Large \bf \@title \par}
% additional two empty lines at the end of the title
\lineskip .5em
\ificcvfinal\@author\else Anonymous ICCV submission\\
\vspace*{1pt}\\%This space will need to be here in the final copy, so don't squeeze it out for the review copy.
Paper ID \iccvPaperID \fi
% additional small space at the end of the author name
\vskip .5em
% additional empty line at the end of the title block
\centerline{\large\bf Abstract}%
% additional empty line at the end of the abstract
\def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{}
\setbox\@tempboxa\hbox{\small \noindent #1.~#2}
% IF longer than one indented paragraph line
\ifdim \wd\@tempboxa >\@ctmp
% THEN DON'T set as an indented paragraph
{\small #1.~#2\par}
% ELSE center
\hbox to\hsize{\hfil\box\@tempboxa\hfil}
% correct heading spacing and type
\def\iccvsection{\@startsection {section}{1}{\z@}
{10pt plus 2pt minus 2pt}{7pt} {\large\bf}}
\def\iccvsect#1{\iccvsection{\hskip -1em.~#1}}
\def\iccvsubsection{\@startsection {subsection}{2}{\z@}
{8pt plus 2pt minus 2pt}{6pt} {\elvbf}}
\def\iccvsubsect#1{\iccvsubsection{\hskip -1em.~#1}}
%% --------- Page background marks: Ruler and confidentiality
% ----- define vruler
% NUMBER with left flushed zeros \fillzeros[<WIDTH>]<NUMBER>
\newcount\cv@tmpc@ \newcount\cv@tmpc
\cv@tmpc=1 %
\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
\ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
\global\setbox\iccvrulerbox=\vbox to \textheight{%
\loop\setbox\cv@tmpbox=\hbox to0cm{{\iccvtenhv\hfil\fillzeros[#4]\iccvrulercount}}%
% ----- end of vruler
% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
% \color[gray]{.15}\framebox(\LenToUnit{\textwidth},\LenToUnit{\textheight}){}
\advance\iccvruleroffset by -3.7pt
\put(\LenToUnit{-35pt},\LenToUnit{-\iccvruleroffset}){%left ruler
\iccvtmppos=\textwidth\advance\iccvtmppos by 30pt
\put(\LenToUnit{\iccvtmppos},\LenToUnit{-\iccvruleroffset}){%right ruler
\def\pid{\parbox{1in}{\begin{center}\bf\sf{\small ICCV}\\\#\iccvPaperID\end{center}}}
\AtTextUpperLeft{%paperID in corners
\iccvtmppos=\textwidth\advance\iccvtmppos by -8pt
%%% Make figure placement a little more predictable.
% We trust the user to move figures if this results
% in ugliness.
% Minimize bad page breaks at figures
% Add a period to the end of an abbreviation unless there's one
% already, then \xspace.
\def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot}
\def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot}
\def\cf{\emph{c.f}\onedot} \def\Cf{\emph{C.f}\onedot}
\def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot}
\def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot}
\def\etal{\emph{et al}\onedot}
% ---------------------------------------------------------------
%% This is file `everyshi.sty',
%% generated with the docstrip utility.
%% The original source files were:
%% everyshi.dtx (with options: `package')
%% Copyright (C) [1994..1999] by Martin Schroeder. All rights reserved.
%% This file is part of the EveryShi package
%% This program may be redistributed and/or modified under the terms
%% of the LaTeX Project Public License, either version 1.0 of this
%% license, or (at your option) any later version.
%% The latest version of this license is in
%% CTAN:macros/latex/base/lppl.txt.
%% Happy users are requested to send me a postcard. :-)
%% The EveryShi package contains these files:
%% everyshi.asc
%% everyshi.dtx
%% everyshi.dvi
%% everyshi.ins
%% everyshi.bug
%% Error Reports in case of UNCHANGED versions to
%% Martin Schr"oder
%% Cr"usemannallee 3
%% D-28213 Bremen
%% Martin.Schroeder@ACM.org
%% File: everyshi.dtx Copyright (C) 2001 Martin Schr\"oder
[2001/05/15 v3.00 EveryShipout Package (MS)]
%% \CharacterTable
%% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
%% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
%% Digits \0\1\2\3\4\5\6\7\8\9
%% Exclamation \! Double quote \" Hash (number) \#
%% Dollar \$ Percent \% Ampersand \&
%% Acute accent \' Left paren \( Right paren \)
%% Asterisk \* Plus \+ Comma \,
%% Minus \- Point \. Solidus \/
%% Colon \: Semicolon \; Less than \<
%% Equals \= Greater than \> Question mark \?
%% Commercial at \@ Left bracket \[ Backslash \\
%% Right bracket \] Circumflex \^ Underscore \_
%% Grave accent \` Left brace \{ Vertical bar \|
%% Right brace \} Tilde \~}
%% \iffalse meta-comment
%% ===================================================================
%% @LaTeX-package-file{
%% author = {Martin Schr\"oder},
%% version = "3.00",
%% date = "15 May 2001",
%% filename = "everyshi.sty",
%% address = {Martin Schr\"oder
%% Cr\"usemannallee 3
%% 28213 Bremen
%% Germany},
%% telephone = "+49-421-2239425",
%% email = "martin@oneiros.de",
%% pgp-Key = "2048 bit / KeyID 292814E5",
%% pgp-fingerprint = "7E86 6EC8 97FA 2995 82C3 FEA5 2719 090E",
%% docstring = "LaTeX package which provides hooks into
%% \cs{shipout}.
%% }
%% ===================================================================
%% \fi
\global\setbox\@cclv= %
\message{ABD: EveryShipout initializing macros}%
%% End of file `everyshi.sty'.
This diff is collapsed.
\usepackage[usenames, dvipsnames]{color}
%\newcommand{\etal}{\textit{et al}. }
%\newcommand{\ie}{\textit{i}.\textit{e}., }
%\newcommand{\eg}{\textit{e}.\textit{g}. }
\newcommand{\TODO}{{\color{red}\textbf{TODO }}}
\ No newline at end of file
% Include other packages here, before hyperref.
% If you comment hyperref and then uncomment it, you should delete
% egpaper.aux before re-running latex. (Or just hit 'q' on the first latex
% run, let it finish, and you should be clear).
%\iccvfinalcopy % *** Uncomment this line for the final submission
\def\iccvPaperID{****} % *** Enter the ICCV Paper ID here
% Pages are numbered in submission mode, and unnumbered in camera-ready
We briefly review the JPEG compression/decompression algorithm \cite{wallace1992jpeg} and introduce the tensor method that we use to formulate our networks \cite{smith1994fast}.
\subsection{JPEG Compression}
The JPEG compression algorithm is defined as the following steps.
\item Divide the image into $8 \times 8$ blocks
\item Compute the 2D forward Discrete Cosine Transform (DCT Type 2) of each block
\item Linearize the blocks using a zigzag order to produce a 64 component vector
\item Element-wise divide each vector by a quantization coefficient
\item Round the vector elements to the nearest integer
\item Entropy code the vectors
Where the 2D Type 2 DCT for an $N \times N$ block is given by
D_{\alpha\beta} = \frac{1}{\sqrt{2N}}A(\alpha)A(\beta)\sum_{i=0}^{N-1}\sum_{j=0}^{N-1} I_{ij} \times \nonumber \\ \cos\left(\frac{(2i+1)\alpha\pi}{2N}\right) \cos\left(\frac{(2j+1)\beta\pi}{2N}\right) \\
A(k) = \begin{cases}
\frac{1}{\sqrt{2}} & k = 0 \\
1 & k \neq 0
This process is repeated independently for each image plane. In most cases, the original image is transformed from the RGB color space to YUV and chroma subsampling is applied since the human visual system is less sensitive to small color changes than to small brightness changes \cite{winkler2001vision}. The decompression algorithm is the inverse process. Note that the rounding step (step 5) must be skipped during decompression; this is the step in JPEG compression where information is lost and is the cause of artifacting in decompressed JPEG images.
The magnitude of the information loss can be tuned using the quantization coefficients. If a larger coefficient is applied in step 4, then the result will be closer to 0 which increases its likelihood of being dropped altogether. In this way, the JPEG transform forces sparsity on the representation, which is why it compresses the image data so well. This is coupled with the tendency of the DCT to push the magnitude of the coefficients into the upper left corner (the DC coefficient and the lowest spatial frequency) to result in high spatial frequencies being dropped. Not only do these high spatial frequencies contribute less response to the human visual system, but they are also the optimal set to drop for a least squares reconstruction of the original image:
\begin{theorem}[DCT Least Squares Approximation Theorem]
Given a set of $N$ samples of a signal $X = \{x_0, \ldots, x_{N-1}\}$, let $Y = \{y_0, \ldots, y_{N-1}\}$ be the DCT coefficients of $X$. Then, for any $1 \leq m \leq N$, the approximation
p_m(t) = \frac{1}{\sqrt{N}}y_0 + \sqrt{\frac{2}{N}}\sum_{k=1}^{m-1} y_k\cos\left(\frac{k(2t + 1)\pi}{2N}\right)
of $X$ minimizes the least squared error
e_m = \sum_{i=0}^{N-1} (p_m(i) - x_i)^2
Theorem \ref{thm:dctls} states that a reconstruction using the $m$ lowest spatial frequencies is optimal with respect to any other set of $m$ spatial frequencies. Proof of Theorem \ref{thm:dctls} is given in the supplementary material.
\subsection{JPEG Linear Map}
A key observation of the JPEG algorithm, and the foundation of most compressed domain processing methods \cite{chang1992video, chang1993new, natarajan1995fast, shen1995inner, shen1996direct, shen1998block, smith1993algorithms, smith1994fast} is that steps 1-4 of the JPEG compression algorithm are linear maps, so they can be composed, along with other linear operations, into a single linear map which performs the operations on the compressed representation. Step 5, the rounding step, cannot be undone and Step 6, the entropy coding, is nonlinear and therefore must be undone. We define the JPEG Transform Domain as the output of Step 4 in the JPEG encoding algorithm. Inputs to the algorithms described here will be JPEGs after reversing the entropy coding.
Formally, we model a single plane image as the type (0, 2) tensor $I \in V^* \otimes V^*$ for some vector space $V$ and its dual $V^*$. Note that we take all tensors as a tensor product combination of $V$ and $V^*$ without loss of generality. In real images, the dimensions have physical meaning (\eg width and height of the image) and will be of different sizes. The analysis presented in this work applies to any vector space although in real images we are dealing with floating point numbers. The basis of $V$ is always the standard orthonormal basis; this is important as it allows the free raising and lowering of indices without the use of a metric tensor.
We define the JPEG transform $J \in V \otimes V \otimes V^* \otimes V^* \otimes V^*$, a type (2,3) tensor. Then $J$ represents a linear map $J: V^* \otimes V^* \rightarrow V^* \otimes V^* \otimes V^*$ which is computed as (in Einstein notation)
I'_{xyk} = J^{sr}_{xyk}I_{sr}
and we say that $I'$ is the representation of $I$ in the JPEG transform domain. In the above equation, the indices $s,r$ give the pixel position, the indices $x,y$ give the block position, and the index $k$ gives the offset into the block.
The form of $J$ is constructed from the JPEG compression steps listed in the previous section. Let the linear map $B: V^* \otimes V^* \rightarrow V^* \otimes V^* \otimes V^* \otimes V^*$ be defined as
B^{sr}_{xyij} = \left\{ \begin{array}{lr} 1 & \text{$s,r$ belongs in block $x,y$ at offset $i,j$} \\ 0 & \text{otherwise} \end{array} \right.
then $B$ can be used to break the image represented by $I$ into blocks of a given size such that the first two indices $x,y$ index the block position and the last two indices $i,j$ index the offset into the block.
Next, let the linear map $D: V^* \otimes V^* \rightarrow V^* \otimes V^*$ be defined as
D^{ij}_{\alpha\beta} = \frac{1}{4}A(\alpha)A(\beta)\cos\left(\frac{(2i+1)\alpha\pi}{16}\right)\cos\left(\frac{(2j+1)\beta\pi}{16}\right)
then $D$ represents the 2D discrete forward (and inverse) DCT. Let $Z: V^* \otimes V^* \rightarrow V^*$ be defined as
Z^{\alpha\beta}_\gamma = \left\{ \begin{array}{lr} 1 & \text{$\alpha, \beta$ is at $\gamma$ under zigzag ordering} \\ 0 & \text{otherwise} \end{array} \right.
then $Z$ creates the zigzag ordered vectors. Finally, let $S: V^* \rightarrow V^*$ be
S^\gamma_k = \frac{1}{q_k}
where $q_k$ is a quantization coefficient, $S$ can be used to scale the vector entries by their quantization coefficients.
With linear maps for each step of the JPEG transform, we can then apply them to each other to create the $J$ tensor that was described at the beginning of this section
J^{sr}_{xyk} = B^{sr}_{xyij}D^{ij}_{\alpha\beta}Z^{\alpha\beta}_{\gamma}S^\gamma_k
The inverse mapping also exists as a tensor $\widetilde{J}$ which can be defined using the same linear maps with the exception of $S$. Let $\widetilde{S}$ be
\widetilde{S}^k_\gamma = q_k
\widetilde{J}^{xyk}_{sr} = B_{sr}^{xyij}D_{ij}^{\alpha\beta}Z_{\alpha\beta}^{\gamma}\widetilde{S}^k_\gamma
noting that, for all tensors other than $\widetilde{S}$, we have freely raised and lowered indices without the use of a metric tensor on $V$ since we consider only the standard orthonormal basis, as stated earlier.
Next consider a linear map $C: V^* \otimes V^* \rightarrow V^* \otimes V^*$ which performs an arbitrary pixel manipulation on an image $I$. To apply this mapping to a JPEG image $I'$, we would first decompress the image, apply $C$ to the result, then compress that result to get the final JPEG. Since compressing is an application of $J$ and decompressing is an application of $\widetilde{J}$, we can form a new linear map $\Xi: V^* \otimes V^* \otimes V^* \rightarrow V^* \otimes V^* \otimes V^*$ as
\Xi^{xyk}_{x'y'k'} = \widetilde{J}^{xyk}_{sr}C^{sr}_{s'r'}J^{s'r'}_{x'y'k'}
which applies $C$ in the JPEG transform domain. There are two important points to note about $\Xi$. The first is that, although it encapsulates decompression, applying $C$ and compressing, it uses far fewer operations than doing these processes separately since the coefficients are multiplied out. The second is that it is mathematically equivalent to performing $C$ on the decompressed image and compressing the result; it is not an approximation.
\ No newline at end of file
\section{Conclusion and Future Work}
In this work we showed how to formulate deep residual learning in the JPEG transform domain, and we showed that it provides a notable performance benefit in terms of processing time for each image. Our method uses a model of convolutions as a linear map \cite{smith1994fast} and introduces a novel approximation technique for ReLu which, to our knowledge, is the first rigorous attempt at defining a non-linear function in the JPEG transform domain. We showed that the approximation can achieve highly performant results with little impact on the overall classification accuracy.
Future work should focus on two main points. The first is efficiency of representation. Our linear maps take up more space, especially when stored in dense tensor data structures, than spatial domain convolutions. This makes it hard to scale the networks to datasets with large image sizes. Secondly, library support in commodity deep learning libraries for some of the features required by this algorithm is lacking. As of this writing, true sparse tensor support is missing in all of PyTorch \cite{paszke2017automatic}, TensorFlow \cite{tensorflow2015-whitepaper}, and Caffe \cite{jia2014caffe}, with these tensors being represented as coordinate lists which are known to be highly non-performant. Additionally, the \texttt{einsum} function for evaluating multilinear expressions is not fully optimized in these libraries when compared to the speed of convolutions in libraries like CuDNN \cite{chetlur2014cudnn}.
\ No newline at end of file
We give experimental evidence for the efficacy of our method, starting with a brief discussion of the architectures we use and the datasets for experimentation. We use model conversion as a sanity check, ensuring that the JPEG model with exact ReLu matches exactly the testing accuracy of a spatial domain model. Next we show how the ReLu approximation accuracy affects the overall network performance. We conclude by showing the training and testing time advantage of our method.
\subsection{Network Architectures and Datasets}
Since we are concerned with reproducing the inference results of spatial domain networks, we choose the MNIST \cite{lecun1998mnist} and CIFAR-10/100 \cite{krizhevsky2009learning} datasets since they are easy to work with. The MNIST images are padded to $32 \times 32$ before being used to ensure an even number of JPEG blocks. Our network architecture is similarly simple and is shown in Figure \ref{fig:na}. The classification network consists of three residual blocks with the final two performing downsampling so that the final feature map consists of a single JPEG block.
\caption{Simple network architecture. $T$ indicates the batch size.}
\subsection{Model Conversion}
\TODO Show that on both datasets over several (maybe a hundred?) of both models trained in spatial domain, testing in spatial domain matches testing in JPEG domain.
% CIFAR10:
\subsection{ReLu Approximation Accuracy}
\TODO two things to show here. The first is a large scale test over $8 \times 8$ blocks that shows how the error of the ReLu approximation itself changes from 0-14 spatial frequencies (same thing from the notes). Second is a similar test but using (again like 100) models of both types for both datasets, show how the testing accuracy changes from 0-14 spatial frequencies. Finally, show that if you train and test like 100 models using 0-14 spatial frequencies, show that there's less error because the convolutional weights will learn to cope with the appx.
\subsection{Efficiency of Training and Testing}
\TODO simple test here, show averaged timing results for training and testing both datasets, then show images/sec for inference for both models. Try to compute number of operations on average by measuring sparsity (???)
\ No newline at end of file
\@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1}\hskip -1em.\nobreakspace {}Introduction}{4322}{section.1}}
The popularization of deep learning since the 2012 AlexNet \cite{krizhevsky2012imagenet} architecture has led to unprecedented gains for the field. Many applications that were once academic are now seeing widespread use of machine learning with success. Although the performance of deep neural networks far exceeds classical methods, there are still some major problems with the algorithms from a computational standpoint. Deep networks require massive amounts of data to learn effectively, especially for complex problems \cite{najafabadi2015deep}. Further, the computational and memory demands of deep networks mean that for many large problems, only large institutions with GPU clusters can afford to train from scratch, leaving the average scientist to fine tune pre-trained weights.
This problem has been addressed many times in the literature. Batch normalization \cite{ioffe2015batch} is ubiquitous in modern networks to accelerate their convergence. Residual learning \cite{he2016deep} allows for much deeper networks to learn effective mappings without overfitting. Techniques such as pruning and weight compression \cite{han2015deep} are becoming more commonplace. As problems become even larger and more complex, these techniques are increasingly being relied upon for efficient training and inference.
We tackle this problem at the level of the image representation. The JPEG image compression algorithm is the most widespread image file format. Traditionally, the first step in using JPEGs for machine learning is to decompress them. We propose to skip this step and instead reformulate the ResNet architecture to perform its operations directly on compressed images. The goal is to produce a new network that is mathematically equivalent to the spatial domain network, but which operates on compressed images by composing the compression transform into the network weights, which can be done because they are both linear maps. Because the ReLu function is non-linear, we develop an approximation technique for it. This is a general method and, to our knowledge, is the first attempt at formulating a piecewise linear function in the transform domain presented in the literature.
The contributions of this work are as follows
\item The general method for expressing convolutional networks in the JPEG domain
\item Concrete formulation for residual blocks to perform classification
\item A model conversion algorithm to apply pretrained spatial domain networks to JPEG images
\item Approximated Spatial Masking: the first general technique for application of piecewise linear functions in the transform domain
By skipping the decompression step and by operating on the sparser compressed format, we show a notable increase in speed for training and inference.
\ No newline at end of file
This diff is collapsed.
\section{Network Architectures and Model Conversion}
\ No newline at end of file
\section{Prior Work}
We briefly review prior work separated into three categories: compressed domain operations, machine learning in the compressed domain, and deep learning in the compressed domain.
\subsection{Compressed Domain Operations}
The expression of common operations in the compressed domain was an extremely active area of study in the late 80s and early 90s, motivated by the lack of computing power to quickly decompress, process, and recompress images and video. On the JPEG side, Smith and Rowe \cite{smith1993algorithms} formulate fast JPEG compatible algorithms for performing scalar and pixelwise addition and multiplication. This was extended by Shen and Sethi \cite{shen1995inner} to general blockwise operations and by Smith \cite{smith1994fast} to arbitrary linear maps. Natarajan and Vasudev \cite{natarajan1995fast} additionally formulate an extremely fast approximate algorithm for scaling JPEG images. On the MPEG side, Chang \etal \cite{chang1992video} introduce the basic algorithms for manipulating compressed video. Chang and Messerschmitt \cite{chang1993new} give a fast algorithm for decoding motion compensation before DCT which allows arbitrary video compositing operations to be performed.
\subsection{Machine Learning in the Compressed Domain}
Compressed domain machine learning grew out of the work in the mid 90s. Arman \etal \cite{arman1993image} give the basic framework for image processing of compressed images. Feng and Jiang \cite{feng2002jpeg} show how image retrieval can be performed directly on compressed JPEGs. He \etal \cite{he2009efficient} extend their work with a hypothesis testing technique. Wu \etal \cite{wu2013sift} formulate the popular SIFT feature extraction in the DCT domain.
\subsection{Deep Learning in the Compressed Domain}
Because deep learning in particular is a non-linear map, it has received limited study in the compressed domain. Ghosh and Chellappa \cite{ghosh2016deep} use a DCT as part of their network's first layer and show that it speeds up convergence for training. Wu \etal \cite{wu2018compressed} formulate a deep network for video action recognition that uses a separate network for i-frames and p-frames. Since the p-frame network functions on raw motion vectors and error residuals it is considered compressed domain processing, although it works in the spatial domain and not the quantized frequency domain as in this work. Wu \etal show a significant efficiency advantage compared to traditional 3D convolution architectures, which they attribute to the p-frame data being a minimal representation of the video motion. Gueguen \etal \cite{gueguen_2018_ICLR} formulate a traditional ResNet that operates on DCT coefficients directly instead of pixels, \ie the DCT coefficients are fed to the network. They show that learning is able to converge faster on this input, further motivating the JPEG representation.
\ No newline at end of file
\ No newline at end of file
\ No newline at end of file