\section{Proof of the DCT Least Squares Approximation Theorem}

\begin{theorem}[DCT Least Squares Approximation Theorem]

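Given a set of $N$ samples of a signal $X = \{x_0, \ldots, x_{N-1}\}$, let $Y = \{y_0, \ldots, y_{N-1}\}$ be the DCT coefficients of $X$. Then, for any $1 \leq m \leq N$, the approximation of $X$ reconstructed from the first $m$ coefficients $y_0, \ldots, y_{m-1}$ minimizes the least squares error over all approximations built from the first $m$ DCT basis functions.
% NOTE(review): the end of this statement was missing in the source and has been completed from the proof below; confirm the wording against the main text.

\end{theorem}

\begin{proof}

First consider that since Equation~\ref{eq:dct1d} represents the Discrete Cosine Transform, which is a linear map, we can write the least squares problem as the overdetermined linear system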

\begin{equation}
    D_m^T y = x
\end{equation}

where $D_m$ is formed from the first $m$ rows of the DCT matrix, $y$ is a column vector of the $m$ DCT coefficients to be determined, and $x$ is a column vector of the original samples.

To find the least squares solution, we use the normal equations; that is, we solve

\begin{equation}
    D_m D_m^T y = D_m x
\end{equation}

and since the DCT is an orthonormal transformation, the rows of $D_m$ are orthonormal, so $D_m D_m^T = I$. Therefore

\begin{equation}
    y = D_m x.
\end{equation}

Since $D_m x$ is exactly the vector of the first $m$ DCT coefficients of $x$, the least squares solution must use the first $m$ DCT coefficients.

\end{proof}

\section{Proof of the DCT Mean-Variance Theorem}

\begin{theorem}[DCT Mean-Variance Theorem]

Given a set of samples of a signal $X$ such that $\e[X]=0$, let $Y$ be the DCT coefficients of $X$. Then

\begin{equation}
    \var[X] = \e[Y^2]
\end{equation}

\end{theorem}

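\begin{proof}

Since $\e[X] = 0$, the variance reduces to the second moment: $\var[X] = \e[X^2] - \e[X]^2 = \e[X^2]$.
Write the $N$ samples as a column vector $x$ and their DCT coefficients as $y = Dx$, where $D$ is the DCT matrix.
Because the DCT is an orthonormal transformation, $D^T D = I$, and therefore $y^T y = x^T D^T D x = x^T x$.
Dividing both sides by $N$ gives $\e[Y^2] = \e[X^2] = \var[X]$.
% NOTE(review): this proof was empty in the source; the argument above takes \e to be the average over the N samples (respectively coefficients) -- confirm this matches the definition used in the main text.

\end{proof}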

\section{Algorithms}

\begin{algorithm}

% NOTE(review): two \caption commands were present in this float, which would print two captions and consume two algorithm numbers. The later one is kept; confirm which title is intended.
% \caption{Direct Convolution Explosion. $K$ is an initial filter, $m, n$ are the input and output channels, $h, w$ are the image height and width, $s$ is the stride, $\star_s$ denotes the discrete convolution with stride $s$}

\caption{Convolution Explosion. $K$ is an initial filter, $m, n$ are the input and output channels, $h, w$ are the image height and width, $s$ is the stride, and $\star_s$ denotes the discrete convolution with stride $s$.}

\label{alg:dce}

\begin{algorithmic}

\Function{Explode}{$K, m, n, h, w, s$}

...

...

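% @@ -51,7 +91,7 @@  (stray diff hunk header, not document content; the lines elided around it still need to be restored)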

\end{algorithm}

\begin{algorithm}

% NOTE(review): two \caption commands were present in this float; the later one is kept. Confirm which title is intended.
% \caption{Automated Spatial Masking for ReLu. $F$ is a DCT domain block, $\phi$ is the desired maximum spatial frequencies, $N$ is the block size.}

\caption{Approximated Spatial Masking for ReLu. $F$ is a DCT domain block, $\phi$ is the desired maximum spatial frequency, and $N$ is the block size.}