\title{Supplementary Material for Deep Residual Learning in the JPEG Transform Domain}
\maketitle
\section{Proof of the DCT Least Squares Approximation Theorem}
\begin{theorem}[DCT Least Squares Approximation Theorem]
Given a set of $N$ samples of a signal $X =\{x_0, \ldots, x_{N-1}\}$, let $Y =\{y_0, \ldots, y_{N-1}\}$ be the DCT coefficients of $X$. Then, for any $1\leq m \leq N$, the best least-squares approximation of $X$ in the span of the first $m$ DCT basis vectors is obtained by using the first $m$ DCT coefficients $y_0, \ldots, y_{m-1}$.
\end{theorem}
\begin{proof}
First, note that since Equation \ref{eq:dct1d} defines the Discrete Cosine Transform, which is a linear map, we can rewrite it as
\begin{equation}
D^T_my = x
\end{equation}
where $D_m$ is formed from the first $m$ rows of the DCT matrix, $y$ is the column vector of DCT coefficients, and $x$ is the column vector of the original samples.
To solve for the least squares solution, we use the normal equations; that is, we solve
\begin{equation}
D_mD^T_my = D_mx
\end{equation}
and since the DCT is an orthonormal transformation, the rows of $D_m$ are orthonormal, so $D_mD^T_m = I$. Therefore
\begin{equation}
y = D_mx
\end{equation}
Since $D_m x$ is exactly the vector of the first $m$ DCT coefficients of $X$, the least-squares solution is given by those coefficients, which completes the proof.
\end{proof}
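The argument can also be checked numerically. The following NumPy sketch (an illustration, not part of the paper; the helper \texttt{dct\_matrix} is ours and follows the orthonormal DCT-II convention) solves the overdetermined system $D_m^T y = x$ by least squares and confirms that the solution coincides with the first $m$ DCT coefficients $D_m x$.
\begin{verbatim}
import numpy as np

def dct_matrix(n):
    # Orthonormal DCT-II matrix: D[k, t] = c_k * cos((2t + 1) k pi / (2n)).
    k = np.arange(n)[:, None]
    t = np.arange(n)[None, :]
    D = np.sqrt(2.0 / n) * np.cos(np.pi * (2 * t + 1) * k / (2 * n))
    D[0] /= np.sqrt(2.0)
    return D

n, m = 16, 5
rng = np.random.default_rng(0)
x = rng.normal(size=n)

D = dct_matrix(n)
D_m = D[:m]                                # first m rows of the DCT matrix

# Least-squares solution of the overdetermined system D_m^T y = x ...
y_ls, *_ = np.linalg.lstsq(D_m.T, x, rcond=None)

# ... coincides with the first m DCT coefficients y = D_m x.
assert np.allclose(y_ls, D_m @ x)
\end{verbatim}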
\section{Proof of the DCT Mean-Variance Theorem}
\begin{theorem}[DCT Mean-Variance Theorem]
Given a set of samples of a signal $X$ such that $\e[X]=0$, let $Y$ be the DCT coefficients of $X$. Then
\begin{equation}
\var[X] = \e[Y^2]
\end{equation}
\end{theorem}
\begin{proof}
Start by considering $\var[X]$. We can rewrite this as
\begin{equation}
\var[X] = \e[X^2] - \e[X]^2
\end{equation}
Since we are given $\e[X]=0$, this simplifies to
\begin{equation}
\var[X] = \e[X^2]
\end{equation}
Next, we write the inverse DCT as a linear map $X = DY$, where $D$ is an orthogonal matrix, and rewrite the previous equation as
\begin{equation}
\var[X] = \e[(DY)^2]
\end{equation}
Interpreting the expectation as an average over the $N$ samples and expanding the square as an inner product gives
\begin{equation}
\e[(DY)^2] = \frac{1}{N}(DY)^T(DY) = \frac{1}{N}\,Y^T(D^TD)\,Y
\end{equation}
Since $D$ is orthogonal, $D^TD = D^{-1}D = I$, and so
\begin{equation}
\frac{1}{N}\,Y^T(D^TD)\,Y = \frac{1}{N}\,Y^TY = \e[Y^2]
\end{equation}
which completes the proof.
\end{proof}
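As a quick numerical sanity check (an illustration, not part of the paper), the identity can be verified with SciPy's orthonormal DCT:
\begin{verbatim}
import numpy as np
from scipy.fft import dct

rng = np.random.default_rng(1)
x = rng.normal(size=64)
x -= x.mean()                         # enforce E[X] = 0

y = dct(x, norm='ortho')              # orthonormal DCT-II coefficients

# Var[X] equals the mean of the squared DCT coefficients.
assert np.allclose(x.var(), np.mean(y ** 2))
\end{verbatim}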
\section{Algorithms}
We conclude by giving pseudocode for the three layer operations described in the paper. Algorithm \ref{alg:dce} covers convolution explosion, Algorithm \ref{alg:asmr} covers the ASM ReLU approximation, and Algorithm \ref{alg:bn} covers batch normalization.
\captionof{algorithm}{Convolution Explosion. $K$ is an initial filter; $m$ and $n$ are the numbers of input and output channels; $h$ and $w$ are the image height and width; $s$ is the stride; $\star_s$ denotes discrete convolution with stride $s$.}\label{alg:dce}
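The transform-domain operator constructed by Algorithm \ref{alg:dce} rests on the fact that an orthonormal change of basis turns any linear spatial operator $A$ (in particular a strided convolution written in matrix form) into the equivalent coefficient-space operator $DAD^T$. The NumPy sketch below checks this identity in one dimension with a random stand-in operator; it only illustrates the principle and omits the multi-channel, blockwise bookkeeping handled by $K$, $m$, $n$, $h$, $w$, and $s$ in the paper's algorithm.
\begin{verbatim}
import numpy as np
from scipy.fft import dct

n = 8
D = dct(np.eye(n), axis=0, norm='ortho')   # orthonormal DCT-II matrix

rng = np.random.default_rng(2)
A = rng.normal(size=(n, n))   # stand-in for a convolution written as a matrix
x = rng.normal(size=n)        # spatial signal
y = D @ x                     # its DCT coefficients

A_hat = D @ A @ D.T           # "exploded" operator acting on coefficients

# Applying A in the spatial domain and A_hat in the DCT domain agree.
assert np.allclose(D @ (A @ x), A_hat @ y)
\end{verbatim}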
\captionof{algorithm}{Approximated Spatial Masking for ReLU. $F$ is a DCT-domain block; $\phi$ is the desired maximum number of spatial frequencies; $N$ is the block size.}\label{alg:asmr}
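The approximation idea behind Algorithm \ref{alg:asmr} can be sketched in one dimension: estimate the sign pattern of the spatial block from only its first $\phi$ spatial frequencies, then use that pattern as a multiplicative mask. The sketch below is an illustration only; the function name is ours, and unlike the paper's algorithm, which applies the mask to 2-D blocks directly in the transform domain, it takes an explicit round trip through the spatial domain.
\begin{verbatim}
import numpy as np
from scipy.fft import dct

def asm_relu_1d(F, phi, D):
    # Estimate where the spatial block is non-negative using only the
    # first `phi` DCT coefficients, then mask the exact block with that
    # estimate and return to the DCT domain.
    coarse = D[:phi].T @ F[:phi]        # low-frequency spatial estimate
    mask = (coarse >= 0).astype(F.dtype)
    exact = D.T @ F                     # exact spatial block
    return D @ (mask * exact)

n, phi = 8, 3
rng = np.random.default_rng(3)
x = rng.normal(size=n)
D = dct(np.eye(n), axis=0, norm='ortho')

approx = asm_relu_1d(D @ x, phi, D)
exact = D @ np.maximum(x, 0.0)          # true ReLU, for comparison
print(np.abs(approx - exact).max())     # small when the mask is accurate
\end{verbatim}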
\captionof{algorithm}{Batch Normalization. $F$ is a batch of JPEG blocks of dimensions $N \times 64$; $S$ is the inverse quantization matrix; $m$ is the momentum for updating the running statistics; $t$ is a flag denoting training or testing mode. The parameters $\gamma$ and $\beta$ are stored externally to the function. A hat ($\widehat{\cdot}$) denotes a batch statistic and a tilde ($\tilde{\cdot}$) denotes a running statistic.}\label{alg:bn}
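The mean-variance theorem is what makes Algorithm \ref{alg:bn} possible: the spatial mean of an $8\times8$ block is carried entirely by its DC coefficient, and the second moment can be read directly off the coefficients, so the batch statistics never require an inverse DCT. The sketch below is our simplification, not the paper's Algorithm \ref{alg:bn}; it drops the inverse quantization matrix $S$, the running statistics, and the flag $t$, treats $\gamma$ and $\beta$ as scalars, and checks the result against batch normalization applied in the spatial domain.
\begin{verbatim}
import numpy as np
from scipy.fft import dctn

def jpeg_bn_sketch(F, gamma, beta, eps=1e-5):
    # F: (N, 64) batch of flattened 8x8 orthonormal DCT blocks, DC first.
    N, B = F.shape
    mean = F[:, 0].mean() / 8.0            # spatial batch mean (DC = 8 * mean)
    second = (F ** 2).sum() / (N * B)      # E[X^2] via the mean-variance theorem
    var = second - mean ** 2               # spatial batch variance
    scale = gamma / np.sqrt(var + eps)
    out = F * scale                        # scaling acts on every coefficient
    out[:, 0] = (F[:, 0] - 8.0 * mean) * scale + 8.0 * beta  # shift only the DC
    return out

rng = np.random.default_rng(4)
blocks = rng.normal(size=(32, 8, 8))
F = dctn(blocks, axes=(1, 2), norm='ortho').reshape(32, 64)

out = jpeg_bn_sketch(F, gamma=1.5, beta=0.2)

# Reference: batch-normalize in the spatial domain, then take the DCT.
mu, var = blocks.mean(), blocks.var()
ref = 1.5 * (blocks - mu) / np.sqrt(var + 1e-5) + 0.2
ref_F = dctn(ref, axes=(1, 2), norm='ortho').reshape(32, 64)
assert np.allclose(out, ref_F)
\end{verbatim}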