 ... ... @@ -18,10 +18,15 @@ % Pages are numbered in submission mode, and unnumbered in camera-ready \ificcvfinal\pagestyle{empty}\fi \setcounter{page}{4321} \addbibresource{bibliography.bib} \DeclareCaptionFormat{algor}{% \hrulefill\par\offinterlineskip\vskip1pt% \textbf{#1#2}#3\offinterlineskip\hrulefill} \DeclareCaptionStyle{algori}{singlelinecheck=off,format=algor,labelsep=space} \captionsetup[algorithm]{style=algori} \begin{document} \title{Supplementary Material} ... ... @@ -44,7 +49,7 @@ \label{thm:dctls} \end{theorem} \begin{proof} First consider that since Equation \ref{eq:dct1d} represents the Discrete Cosine Transform, which is a Linear map, we can write the least squares error as First consider that since Equation \ref{eq:dct1d} represents the Discrete Cosine Transform, which is a Linear map, we can rewrite it as \begin{equation} D^T_my = x \end{equation} ... ... @@ -70,14 +75,35 @@ Since there is no contradiction, the least squares solution must use the first $ \end{equation} \end{theorem} \begin{proof} Start by considering $\var[X]$.
We can rewrite this as \begin{equation} \var[X] = \e[X^2] - \e[X]^2 \end{equation} Since we are given $\e[X] = 0$, this simplifies to \begin{equation} \var[X] = \e[X^2] \end{equation} Next, we express the DCT as a linear map such that $X = DY$ and rewrite the previous equation as \begin{equation} \var[X] = \e[(DY)^2] \end{equation} Distributing the squaring operation gives \begin{equation} \e[(DY)^2] = \e[(D^TD)Y^2] \end{equation} Since $D$ is orthogonal this simplifies to \begin{equation} \e[(D^TD)Y^2] = \e[(D^{-1}D)Y^2] = \e[Y^2] \end{equation} \end{proof} \section{Algorithms} \begin{algorithm} \caption{Convolution Explosion. $K$ is an initial filter, $m, n$ are the input and output channels, $h, w$ are the image height and width, $s$ is the stride, $\star_s$ denotes the discrete convolution with stride $s$} \label{alg:dce} \begin{algorithmic} We conclude by outlining in pseudocode the algorithms for the three layer operations described in the paper. Algorithm \ref{alg:dce} gives the code for convolution explosion, Algorithm \ref{alg:asmr} gives the code for the ASM ReLu approximation, and Algorithm \ref{alg:bn} gives the code for Batch Normalization. \captionof{algorithm}{Convolution Explosion. $K$ is an initial filter, $m, n$ are the input and output channels, $h, w$ are the image height and width, $s$ is the stride, $\star_s$ denotes the discrete convolution with stride $s$} \label{alg:dce} \begin{algorithmic} \Function{Explode}{$K, m, n, h, w, s$} \State $d_j \gets \mathbf{shape}(\widetilde{J})$ \State $d_b \gets (d_j, d_j, d_j, 1, h, w)$ ... ... @@ -87,13 +113,11 @@ Since there is no contradiction, the least squares solution must use the first $ \State $\widetilde{C} \gets \mathbf{reshape}(\widehat{C}, d_c)$ \State $\mathbf{return} \; \widetilde{C}J$ \EndFunction \end{algorithmic} \end{algorithm} \end{algorithmic} \begin{algorithm} \caption{Approximated Spatial Masking for ReLu.
$F$ is a DCT domain block, $\phi$ is the desired maximum spatial frequencies, $N$ is the block size.} \label{alg:asmr} \begin{algorithmic} \captionof{algorithm}{Approximated Spatial Masking for ReLu. $F$ is a DCT domain block, $\phi$ is the desired maximum spatial frequencies, $N$ is the block size.} \label{alg:asmr} \begin{algorithmic} \Function{ReLu}{$F, \phi, N$} \State $M \gets$ \Call{ANNM}{$F, \phi, N$} \State $\mathbf{return}\;$ \Call{ApplyMask}{$F, M$} ... ... @@ -121,13 +145,31 @@ Since there is no contradiction, the least squares solution must use the first $ \Function{ApplyMask}{$F, M$} \State $\mathbf{return} \; H^{\alpha\beta ij}_{\alpha'\beta'}F_{\alpha\beta}M_{ij}$ \EndFunction \end{algorithmic} \end{algorithm} \begin{algorithm} \caption{Batch Normalization} \label{alg:bn} \end{algorithm} \end{algorithmic} \captionof{algorithm}{Batch Normalization. $F$ is a batch of JPEG blocks (dimensions $N \times 64$), $S$ is the inverse quantization matrix, $m$ is the momentum for updating running statistics, $t$ is a flag that denotes training or testing mode.
The parameters $\gamma$ and $\beta$ are stored externally to the function. $\widehat{}\;$ is used to denote a batch statistic and $\tilde{}\;$ is used to denote a running statistic.} \label{alg:bn} \begin{algorithmic} \Function{BatchNorm}{$F$, $S$, $m$, $t$} \If{$t$} \State $\mu \gets \mathbf{mean}(F[:, 0])$ \State $\widehat{\mu} \gets F[:, 0]$ \State $F[:, 0] = 0$ \State $D_g \gets F_kS_k$ \State $\widehat{\sigma^2} \gets \mathbf{mean}(F^2, 1)$ \State $\sigma^2 \gets \mathbf{mean}(\widehat{\sigma^2} + \widehat{\mu}^2) - \mu^2$ \State $\widetilde{\mu} \gets \widetilde{\mu}(1 - m) + \mu m$ \State $\widetilde{\sigma^2} \gets \widetilde{\sigma^2}(1 - m) + \sigma^2 m$ \State $F[:, 0] \gets F[:, 0] - \mu$ \State $F \gets \frac{\gamma F}{\sigma}$ \State $F[:, 0] \gets F[:, 0] + \beta$ \Else \State $F[:, 0] \gets F[:, 0] - \widetilde{\mu}$ \State $F \gets \frac{\gamma F}{\widetilde{\sigma}}$ \State $F[:, 0] \gets F[:, 0] + \beta$ \EndIf \State $\mathbf{return} \; F$ \EndFunction \end{algorithmic} \end{document}
