commit c4535b7714216664d1adb6b0f4ed3b8bbabeb70d
parent 688d349790b812e539d6dc38ebfedfd454d338b2
Author: miksa234 <milutin@popovic.xyz>
Date: Wed, 17 Jan 2024 14:09:51 +0000
finished summary too
Diffstat:
15 files changed, 237 insertions(+), 114 deletions(-)
diff --git a/opt_sem/pres/pres.tex b/opt_sem/pres/pres.tex
@@ -189,7 +189,7 @@
\end{align*}
\end{minipage}
\end{center}
- where $\varepsilon := \min
+ where $\varepsilon_0 := \min
\{\frac{\eta(\theta_*x_\text{min})^{2}-1}{3},\; 0.02\}$.
\newline
And \textbf{almost surely} there exists $t, k >0$ s.t.
@@ -241,7 +241,7 @@
\item For $\theta_t \in I_3$, then $\theta_{t+1} \in I_1
\cup I_2$.
\item For $\theta_t \in I_2$, then there is a $k>0$:
- $\forall k' < k$, it holds that $\theta{t+2k'} \in
+ $\forall k' < k$, it holds that $\theta_{t+2k'} \in
I_2$ and $\theta_{t+2k} \in I_1$
\item For $\theta_t \in I_1$, then $\forall k \ge 0$, it
holds that
@@ -317,7 +317,7 @@
\begin{itemize}[<+->]
\item Heuristics: rewrite SGD iteration, where $V_t(\theta_t, i_t) =
\sqrt{\eta}\left(\nabla_\theta f(\theta_k) - \nabla
- f_{i_t}(\theta_t) \right) $ is $d$-dimensional r.v.
+ f_{i_t}(\theta_t) \right) $ is $p$-dimensional r.v.
\begin{center}
\begin{minipage}{0.5\textwidth}
\begin{align*}
@@ -347,23 +347,23 @@
\begin{center}
\begin{minipage}{0.5\textwidth}
\begin{align*}
- d\theta\tau = b(\theta_\tau)d\tau
+ d\theta_\tau = b(\theta_\tau)d\tau
+\sqrt{\eta}\sigma(\theta_\tau)dB_\tau,
\end{align*}
\end{minipage}
\end{center}
- where $\theta_\tau \in \mathbb{R}^{n}$, $B_\tau$ standard p-dim.
+ where $\theta_\tau \in \mathbb{R}^{p}$, $B_\tau$ standard p-dim.
Brownian
- motion, $b:\mathbb{R}^{n} \to \mathbb{R}^{n}$ the drift and
+ motion, $b:\mathbb{R}^{p} \to \mathbb{R}^{p}$ the drift and
$\sigma: \mathbb{R}^{p}\to \mathbb{R}^{p\times p}$ the diffusion
matrix
\item Apply Euler discretization with step size $\eta$ and
- approximate $X_{\tau \eta}$ simply with $\hat{X}_{\tau}$
+ approximate $\theta_{\tau \eta}$ simply with $\hat{\theta}_{\tau}$
\begin{center}
\begin{minipage}{0.5\textwidth}
\begin{align*}
- d\hat{\theta}_t= \hat{\theta}_t + \eta b(\theta_t)
+        \hat{\theta}_{t+1}= \hat{\theta}_t + \eta b(\hat{\theta}_t)
+\eta \sigma(\hat{\theta}_t)Z_t,
\end{align*}
\end{minipage}
diff --git a/opt_sem/summary/cite.bib b/opt_sem/summary/cite.bib
@@ -17,12 +17,32 @@
}
@book{shalev2014understanding,
- title={Understanding Machine Learning: From Theory to Algorithms},
- author={Shalev-Shwartz, S. and Ben-David, S.},
- isbn={9781107057135},
- lccn={2014001779},
- series={Understanding Machine Learning: From Theory to Algorithms},
- url={https://books.google.pt/books?id=ttJkAwAAQBAJ},
- year={2014},
- publisher={Cambridge University Press}
+ title={Understanding Machine Learning: From Theory to Algorithms},
+ author={Shalev-Shwartz, S. and Ben-David, S.},
+ isbn={9781107057135},
+ lccn={2014001779},
+ series={Understanding Machine Learning: From Theory to Algorithms},
+ url={https://books.google.pt/books?id=ttJkAwAAQBAJ},
+ year={2014},
+ publisher={Cambridge University Press}
}
+
+@misc{pillaudvivien2022label,
+ title={Label noise (stochastic) gradient descent implicitly solves the Lasso for quadratic parametrisation},
+ author={Loucas Pillaud-Vivien and Julien Reygner and Nicolas Flammarion},
+ year={2022},
+ eprint={2206.09841},
+ archivePrefix={arXiv},
+ primaryClass={stat.ML}
+}
+
+
+@misc{li2018stochastic,
+ title={Stochastic Modified Equations and Dynamics of Stochastic Gradient Algorithms I: Mathematical Foundations},
+ author={Qianxiao Li and Cheng Tai and Weinan E},
+ year={2018},
+ eprint={1811.01558},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+}
+
diff --git a/opt_sem/summary/main.tex b/opt_sem/summary/main.tex
@@ -64,81 +64,170 @@ Two important things are are shown in the above Proposition.
to constant effective scale of label noise, which is clearly
explained throughout the paper summary \cite{andriushchenko2023sgd}.
\end{enumerate}
-\section{Dynamics behind Loss Stabilization}
-For the generic quadratic loss $F(\beta) := \|X\beta - y\|^{2}$, gradient
-descent converges with step size $\eta < \frac{2}{\lambda_{\text{max}}}$,
-diverges for $\eta > \frac{2}{\lambda_{\text{max}}}$ and converges to a
-bouncing 2-periodic dynamics for $\eta = \frac{2}{\lambda_{\text{max}}}$,
+\section{Loss Stabilization: Toy Model}
+For the quadratic loss
+\begin{align}
+F(\beta) := \|X\beta - y\|^{2},
+\end{align}
+gradient descent
+\begin{itemize}
+ \item converges with step size $\eta < \frac{2}{\lambda_{\text{max}}}$,
+ \item diverges for $\eta > \frac{2}{\lambda_{\text{max}}}$
+ \item and converges to a bouncing 2-periodic dynamics for $\eta =
+ \frac{2}{\lambda_{\text{max}}}$,
+\end{itemize}
where $\lambda_{\text{max}}$ is the largest eigenvalue of the Hessian. On the
-other for hand nonquadratic loss there exists an open interval of the step
-sizes for witch the GD algorithm neither converges nor diverges \cite{andriushchenko2023sgd}.
-Complementing this with an example were the loss stabilization occurs almost
-surely in the case of SGD. A regression example with quadratic
-parametrization on the one dimensional data inputs $x_i$ from a distribution
-$\hat{\rho}$ and inputs generated by a linear model $y_i =
-x_i\theta_{*}^{2}$. With the loss $F(\theta) := \frac{1}{4}
-\mathbb{E}_{\hat{\rho}}(y - x\theta^{2})^{2}$, the std iterates with step
-size $\eta>0$ follow for $ t \in \mathbb{N}$
+other hand, for a nonquadratic loss there exists an open interval of step
+sizes for which the GD algorithm neither converges nor diverges
+\cite{andriushchenko2023sgd}. The authors complement this with an example
+where loss stabilization occurs \textbf{almost surely} in the case of SGD.
+\newline
+
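+As a quick numerical illustration of these three step-size regimes (a
+minimal sketch, not from the paper; the random data, dimensions and number
+of steps are arbitrary choices), one can run GD on $F(\beta) = \|X\beta -
+y\|^{2}$ around the threshold $2/\lambda_{\text{max}}$:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+X = rng.standard_normal((50, 5))
+y = X @ rng.standard_normal(5)
+H = 2 * X.T @ X                          # Hessian of F(beta) = ||X beta - y||^2
+lam_max = np.linalg.eigvalsh(H).max()
+
+def final_loss(eta, steps=200):
+    beta = np.zeros(5)
+    for _ in range(steps):
+        beta = beta - eta * 2 * X.T @ (X @ beta - y)   # GD step
+    return np.sum((X @ beta - y) ** 2)
+
+for c in (0.9, 1.0, 1.1):                # below / at / above 2 / lambda_max
+    print(c, final_loss(c * 2 / lam_max))
+# ~0 (converges), constant 2-periodic bounce, very large (diverges)
+\end{verbatim}
+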
+Consider a regression example with quadratic parametrization on one
+dimensional data inputs $x_i \sim \hat{\rho}$ and labels generated by the
+linear model $y_i = x_i\theta_{*}^{2}$. With the loss $F(\theta) := \frac{1}{4}
+\mathbb{E}_{\hat{\rho}}(y - x\theta^{2})^{2}$, the SGD iterates with step
+size $\eta>0$ are, for $ t \in \mathbb{N}$,
\begin{align}
    \theta_{t+1} = \theta_t + \eta \theta_t x_{i_t}(y_{i_t} -
    x_{i_t}\theta_t^{2}).
\end{align}
-W.l.o.g., consider $\theta_* = 1 $ and $\text{supp}(\hat{\rho})=[a,b] \subset
-\mathbb{R}$. Then the following proposition holds
-
+In this case the following holds
\begin{proposition}
\label{prop: loss-stab}
- For any $\eta \in (a^{-2}, 1.25b^{-2})$ and initialization $\theta_0 \in
- (0, 1)$ for all $t>0$,
+    Assume there exist $x_{\text{min}}, x_{\text{max}} > 0$ such that
+    $\text{supp}(\hat{\rho}) \subset [x_{\text{min}}, x_{\text{max}}]$. Then
+    for any $\eta \in ((\theta_*x_{\text{min}})^{-2},
+    1.25(\theta_*x_{\text{max}})^{-2})$ and any initialization $\theta_0 \in
+    (0, \theta_*)$, for all $t \in \mathbb{N}$ we have
+ \begin{align}
+ F(\theta_t) \in (\varepsilon_0\theta_*^{2}, 0.17\theta_*^{2}) \quad
+ \text{\textbf{almost surely}},
+ \end{align}
+    where $\varepsilon_0 = \min \{\frac{\eta(\theta_*
+    x_{\text{min}})^{2}-1}{3},\; 0.02\}$. And \textbf{almost surely} there
+    are $t, k > 0 $ s.t.:
\begin{align}
- &\delta_1 < F(\theta_t) < \delta_2 \qquad \text{a.s.}\\
- &\exists T > 0,\; \forall k > T:\quad \theta_{t+2k}< 1 <
- \theta_{t+2k+1} \qquad \text{a.s.}
+ &\theta_{t+2k} \in (0.65\theta_*, (1-\varepsilon_0)\theta_*) \quad
+ \text{and}\\
+ &\theta_{t+2k+1} \in ((1-\varepsilon_0)\theta_*, 1.162\theta_*)
\end{align}
- Where $\delta_1, \delta_2, T>0$ are constants.
- TODO: give more accurate description of this garbage.
\end{proposition}
So if step sizes are large enough the \textbf{loss stabilizes} between the
level sets $\varepsilon_0\theta_*^{2}$ and $0.17\theta_*^{2}$, and after some
initial phase the iterates bounce
from one side of the \textbf{loss valley} to the other. Note the result
-holds \textbf{almost surely}.
+holds \textbf{almost surely}. The proof idea is to first normalize the SGD
+recursion via $\theta_t \to \theta_t / \theta_*$, i.e.
+\begin{align}
+ \theta_{t+1} = \theta_t + \gamma\theta_t\left(
+ 1-\theta_t^{2} \right),
+\end{align}
+where $\gamma \sim \hat{\rho}_\gamma$ is the pushforward of $\hat{\rho}$
+under $z \mapsto \eta (\theta_* z)^{2}$, so that the interval considered for
+$\eta$ translates into $\text{supp}(\hat{\rho}_\gamma) \subseteq (1, 1.25)$.
+Then we divide the interval $(0, 1.162)$ into $4$ regions
+\begin{align}
+ &I_0 = (0, 0.65],\\
+ &I_1 = (0.65, 1-\varepsilon),\\
+ &I_2 = (1-\varepsilon, 1),\\
+ &I_3 = (1, 1.162),
+\end{align}
+where we note that for all $0< \varepsilon < \varepsilon_0$ the
+inequality $\gamma_{\text{min}} (2-\varepsilon) (1-\varepsilon) >2$ holds,
+with $\gamma_\text{min} = \inf(\text{supp}\left(\hat{\rho}_\gamma \right) )$.
+It is then shown that all iterates end up in $I_1$ and, after leaving it,
+come back to $I_1$ within 2 steps; this is divided into 4 steps
+\begin{enumerate}
+ \item There is a $t\ge 0: \theta_t \in I_1 \cup I_2 \cup I_3$
+    \item For $\theta_t \in I_3$, then $\theta_{t+1} \in I_1 \cup I_2$
+    \item For $\theta_t \in I_2$ there is a $k>0$: $\forall k' < k$ it holds
+        that $\theta_{t+2k'} \in I_2$ and $\theta_{t+2k} \in I_1$.
+ \item For $\theta_t \in I_1$, then $\forall k \ge 0$, it
+ holds that
+ $\theta_{t+2k} \in I_1$ and $\theta_{t+2k+1} \in
+ (1+\varepsilon, 1.162)$.
+\end{enumerate}
+To show 1), it needs to be shown that the map $h_\gamma(\theta)
+=\theta + \gamma \theta(1- \theta^{2})$ maps $(0, 1.162)$ into itself for
+every $\gamma \in (1, 1.25)$. To show 2), for $\theta \in (1, 1.162)$ note
+that $h_\gamma(\theta)$ is linear in $\gamma$ for $\theta>1$, decreasing as
+$\gamma$ increases, and decreasing in $\theta$ on this interval, hence
+\begin{align}
+    0.652 = h_{1.25}(1.162) < h_\gamma(1.162) < h_\gamma(\theta) <
+    h_\gamma(1) = 1.
+\end{align}
+Points 3 and 4 can be shown in a similar way with elementary analysis.
+
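+A minimal simulation of this toy model (a sketch, not from the paper; the
+choices $\theta_*=1$, $x_i \sim \mathcal{U}[0.9, 1]$, $\eta = 1.24$ and
+$\theta_0 = 0.5$ are arbitrary but satisfy the assumptions of Proposition
+\ref{prop: loss-stab}) makes the bouncing and the loss stabilization visible:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(1)
+theta_star, x_min, x_max = 1.0, 0.9, 1.0
+eta = 1.24              # inside ((theta* x_min)^-2, 1.25 (theta* x_max)^-2)
+theta = 0.5             # theta_0 in (0, theta*)
+
+def loss(theta):        # F(theta) = 1/4 E_x (x theta*^2 - x theta^2)^2
+    x = rng.uniform(x_min, x_max, 10_000)
+    return 0.25 * np.mean((x * theta_star**2 - x * theta**2) ** 2)
+
+losses, iterates = [], []
+for t in range(2000):
+    x = rng.uniform(x_min, x_max)                          # sample i_t
+    y = x * theta_star**2
+    theta = theta + eta * theta * x * (y - x * theta**2)   # SGD step
+    losses.append(loss(theta)); iterates.append(theta)
+
+print(min(losses[100:]), max(losses[100:]))  # bounded away from 0 and infinity
+print(iterates[-6:])       # alternates between the two sides of theta*
+\end{verbatim}
+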
+\section{SGD Iteration and Stochastic Differential Equations (SDEs)}
To further understand the effect of this loss stabilization the authors
assume perfect stabilization and conjecture that during the loss
stabilization, SGD is well modeled by GD with constant label noise.
\newline
Label noise dynamics have a connection with the Stochastic Differential
-Equations (SDEs)
-TODO: Stochastic differential equations connection to SGD.
-
-To properly write a model for the stochastic differential equation (SDE)
-dynamics, it needs to be considered that the drift needs to match the
-gradient descent and the noise should have the same covariance structure. By
-Proposition \ref{prop: loss-stab} the noise at step $\theta$ is spanned by
-$\{\nabla_{\theta}h_{\theta}(x_1),\ldots,\nabla_{\theta}h_{\theta}(x_n)\}$
-and has constant intensity corresponding to the loss stabilization at $\delta
->0$. The following SDE model is proposed
+Equations (SDEs); \cite{li2018stochastic} motivates this
+approach. Rewrite the SGD iteration with
+\begin{align}
+    V_t(\theta_t, i_t) = \sqrt{\eta} \left(\nabla f(\theta_t) - \nabla
+    f_{i_t}(\theta_t) \right),
+\end{align}
+where in our case $f$ is the loss function and $f_{i_t}$ is the function
+selected from the sum to estimate the gradient; the SGD iteration then takes
+the following form
+\begin{align}
+    \theta_{t+1} = \theta_t - \eta \nabla f(\theta_t) + \sqrt{\eta}\,
+    V_t(\theta_t, i_t).
+\end{align}
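+This rewrite is simply adding and subtracting the full gradient in the plain
+SGD step $\theta_{t+1} = \theta_t - \eta \nabla f_{i_t}(\theta_t)$:
+\begin{align*}
+    \theta_{t+1} = \theta_t - \eta \nabla f(\theta_t)
+    + \eta\left(\nabla f(\theta_t) - \nabla f_{i_t}(\theta_t)\right)
+    = \theta_t - \eta \nabla f(\theta_t) + \sqrt{\eta}\, V_t(\theta_t, i_t).
+\end{align*}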
+A straightforward calculation shows \cite{li2018stochastic}:
+\begin{align}
+ &\mathbb{E}(V_t|\theta_t) = 0, \\
+ &\text{cov}(V_t, V_t|\theta_t) = \eta \Sigma(\theta_t), \\
+    &\Sigma(\theta_t) :=
+    \mathbb{E}\left(\frac{V_t V_t^{T}}{\eta}\Big|\theta_t \right).
+\end{align}
+On the other hand, consider a time-homogeneous It\^o-type SDE for $\tau>0$
\begin{align}
\label{eq: sde-sgd-dynamics}
- d\theta_t = -\nabla_{\theta} \mathcal{L}(\theta_t)dt + \sqrt{\eta\delta}
- \phi_{\theta_t}(X)^{T}dB_t,
+    d\theta_\tau = b(\theta_\tau)d\tau
+ +\sqrt{\eta}\sigma(\theta_\tau)dB_\tau,
+\end{align}
+where $B_\tau$ is a standard $p$-dimensional Brownian motion, $b:
+\mathbb{R}^{p}\to \mathbb{R}^{p}$ is called the drift and $\sigma:
+\mathbb{R}^{p} \to \mathbb{R}^{p\times p}$ is the diffusion matrix. Applying
+the Euler discretization scheme with step size $\eta$ and approximating
+$\theta_{t\eta}$ with $\hat{\theta}_t$ we get
+\begin{align}
+    \hat{\theta}_{t+1}= \hat{\theta}_t + \eta b(\hat{\theta}_t)
+    +\eta \sigma(\hat{\theta}_t)Z_t.
\end{align}
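+As a minimal illustration of this scheme (a sketch; the drift
+$b(\theta)=-\theta$ and constant diffusion $\sigma(\theta)=1$ are arbitrary
+choices, and the $Z_t$ are i.i.d. standard Gaussian draws):
+\begin{verbatim}
+import numpy as np
+
+def euler_maruyama(b, sigma, theta0, eta, steps, seed=0):
+    # discretizes d theta = b(theta) d tau + sqrt(eta) sigma(theta) dB with
+    # step size eta; the Brownian increment sqrt(eta) Z_t times the extra
+    # sqrt(eta) in front of sigma gives the eta * sigma * Z_t term
+    rng = np.random.default_rng(seed)
+    theta = np.asarray(theta0, dtype=float)
+    path = [theta]
+    for _ in range(steps):
+        z = rng.standard_normal(theta.shape)     # Z_t ~ N(0, I)
+        theta = theta + eta * b(theta) + eta * sigma(theta) * z
+        path.append(theta)
+    return np.array(path)
+
+path = euler_maruyama(b=lambda th: -th, sigma=lambda th: 1.0,
+                      theta0=np.ones(3), eta=0.01, steps=1000)
+print(path[-1])
+\end{verbatim}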
+In a similar way, setting $b = -\nabla_\theta f $ and $\sigma(\theta) =
+\Sigma(\theta)^{\frac{1}{2}}$ we get
+\begin{align}
+    d\theta_\tau = -\nabla f(\theta_\tau)d\tau
+    + (\eta\Sigma(\theta_\tau))^{\frac{1}{2}} dB_\tau.
+\end{align}
+For a loss function $\mathcal{L}$, with noise at state $\theta$ spanned by
+$\{\nabla_\theta h_\theta(x_1) ,\ldots , \nabla_\theta h_\theta(x_n)\}$ and
+loss stabilization occurring at some level set $\delta$, the authors of
+\cite{andriushchenko2023sgd} propose the following SDE model
+\begin{align*}
+ d\theta_\tau = -\nabla_\theta \mathcal{L}(\theta_\tau)d\tau
+ + \sqrt{\eta\delta}
+ \phi_{\theta_\tau}\left(X\right)^{T}dB_\tau,
+\end{align*}
where $(B_t)_{t\ge 0}$ is standard Brownian motion in $\mathbb{R}^{n}$ and
$\phi_{\theta}(X) := [\nabla_{\theta}h_{\theta}(x_i)^{T}]_{i=1}^{n} \in
\mathbb{R}^{n\times p}$ referred to as the Jacobian. This SDE can be
interpreted as the effective slow dynamics that drives the iterates while they
bounce rapidly in some directions at the level set $\delta$.
+
+
\section{Sparse Feature Learning}
-It is begun with a simple example on diagonal linear networks to show a
-sparsity inducing dynamics and then further disclosed to a general message
-about the implicit bias prompted by the effective dynamics.
-\newline
-A diagonal linear network is a two-layer linear network with only diagonal
-connections: the prediction function is $h_{u,v}(x) = \langle u, v \odot
-x\rangle = \langle u \odot v, x\rangle$, where $\odot$ denotes the
+Consider a two-layer linear network with only diagonal connections, i.e. a
+diagonal linear network, with prediction function $h_{u, v}(x) = \langle u,
+v\odot x\rangle = \langle u \odot v, x\rangle$, where $\odot$ denotes the
elementwise multiplication. In this case the loss function is convex in the
linear predictor $\beta:=u\odot v \in \mathbb{R}^{d}$, but not in $(u, v)$.
-Hence we can see from this example a rich non-convex dynamics. Then $\nabla_u
+Hence we can see from this example rich non-convex dynamics. Then $\nabla_u
h_{u, v}(x) = v \odot x$ and the SDE model is
\begin{align}
du_t = -\nabla_u \mathcal{L}(u_t, v_t) dt + \sqrt{\eta\delta}\; v_t \odot
@@ -147,8 +236,9 @@ h_{u, v}(x) = v \odot x$ and the SDE model is
where $(B_t)_{t\ge 0}$ is the standard Brownian motion in $\mathbb{R}^{n}$
and the equations are symmetric for $(v_t)_{t\ge 0}$.
-The behavior of this effective dynamics shows from (Pillaud-Vivien et al.,
-2022) that the linear predictor $\beta_t = u_t \odot v_t$
+The behavior of this effective dynamics, shown in
+\cite{pillaudvivien2022label},
+is that the linear predictor $\beta_t = u_t \odot v_t$
\begin{enumerate}
\item converges exponentially to zero outside of the support of $\beta^{*}$
\item is with high probability in a $\mathcal{O}(\sqrt{\eta\delta})$
@@ -162,57 +252,40 @@ loss-stabilization an effective dynamics takes place. Shrinking the
coordinates outside of the support of the sparsest signal and oscillates in
the parameter space at level $\mathcal{O}(\sqrt{\eta\delta})$ on its support.
Thereby decreasing the step size later leads to perfect recovery of the
-sparsest predictor. TODO: Experiments.
-\newline
+sparsest predictor.
+
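+The following rough sketch of label-noise gradient descent on a diagonal
+linear network is meant only to make this concrete (it is not the authors'
+experimental setup; the dimensions, initialization, step size, noise level
+and iteration count are arbitrary choices and may need tuning):
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, d, r = 25, 40, 3                          # samples, dimension, sparsity
+X = rng.standard_normal((n, d))
+beta_star = np.zeros(d); beta_star[:r] = 1.0
+y = X @ beta_star
+
+u, v = 0.5 * np.ones(d), 0.5 * np.ones(d)    # moderate initialization
+eta, delta = 0.05, 2.0                       # step size, label-noise variance
+
+for t in range(50_000):
+    xi = np.sqrt(delta) * rng.standard_normal(n)       # fresh label noise
+    g = X.T @ (X @ (u * v) - (y + xi)) / n             # grad w.r.t. beta = u*v
+    u, v = u - eta * v * g, v - eta * u * g            # chain rule through u, v
+
+beta = u * v
+print(np.abs(beta[:r]))        # on support: expected to hover near beta_star
+print(np.abs(beta[r:]).max())  # off support: expected to shrink towards zero
+\end{verbatim}
+Decaying the step size at the end of such a run should then sharpen the
+recovery of the sparse predictor, as described above.
+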
The diagonal linear nets show noisy dynamics which induce a sparsity bias,
-which is by HaoChen et al. (2021) due to the term $v_t \odot [X^{T}dB_t]$,
-which has a shrinking effect on the coordinates (due to the element wise
-multiplication). In general from Equation \ref{eq: sde-sgd-dynamics} the same
-multiplicative structure happends w.r.t. the Jacobian $\phi_\theta(X)$. This
+which, according to \cite{andriushchenko2023sgd}, is due to the term $v_t \odot [X^{T}dB_t]$:
+it has a \textit{shrinking effect} on the coordinates (due to the elementwise
+multiplication), i.e. entries collapse almost to zero.
+\newline
+
+In general, Equation \ref{eq: sde-sgd-dynamics} shows the same
+multiplicative structure w.r.t. the Jacobian $\phi_\theta(X)$. This
suggests that the implicit bias of the noise can lead to a shrinkage
effect applied to $\phi_\theta(X)$, which depends on the noise intensity
-$\delta$ and step size of the SGD. Also note the property of the Browninan
-motions: $v \in \mathbb{R}^{p}$ then $\langle v, B_t\rangle = \|v\|_2 W_t$,
+$\delta$ and step size of the SGD.
+
+Also note the following property of Brownian
+motion: for $v \in \mathbb{R}^{p}$, $\langle v, B_t\rangle = \|v\|_2 W_t$,
where $(W_t)_{t\ge 0}$ is a one dimensional Brownian motion. Thereby the
process in Equation \ref{eq: sde-sgd-dynamics} is equivalent to the process
whose $i$-th coordinate is driven by a noise proportional to
$\|\phi_i\|dW_{t}^{i}$. This SDE structure, similar to the geometric
Brownian motion, is expected to induce the shrinkage of each
multiplicative factor $(\|\nabla_\theta h(x_i)\|)_{i=1}^{n}$, hence the
-authors conjecture: \textit{The noise part of Equation
+authors conjecture \cite{andriushchenko2023sgd}: \textit{The noise part of Equation
\ref{eq: sde-sgd-dynamics} seeks to minimize the $l_2$-norm of the columns
of $\phi_\theta(X)$.}
\newline
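+For completeness, the property $\langle v, B_t\rangle = \|v\|_2 W_t$ used
+above holds because $\langle v, B_t\rangle = \sum_{i=1}^{p} v_i B_t^{i}$ is a
+continuous Gaussian martingale with quadratic variation
+\begin{align*}
+    \Big[\sum_{i} v_i B^{i}\Big]_t = \sum_{i} v_i^{2}\, t = \|v\|_2^{2}\, t,
+\end{align*}
+so by L\'evy's characterization it equals $\|v\|_2 W_t$ for a standard
+one-dimensional Brownian motion $(W_t)_{t \ge 0}$.
+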
-Also note that the fitting part of the dynamics prevents the Jacobian to
-collapse totally to zero, but as soon as they are not needed to fit the
-signal, the columns can be reduced to zero. Now the specification of the
-implicit bias for different architectures is provided
-\begin{itemize}
- \item \textbf{Diagonal linear networks:} $h_{u, v}(x) = \langle u \odot v,
- x\rangle$ and the gradient $\nabla_{u, v}h_{u,v}= [v\odot x, u \odot
- x]$. For a generic data matrix $X$, minimizing the norm of each
- column of $\phi_{u, v}(X)$ amounts to put the maximal number of zero
- coordinates and hence to minimize $\|u \odot v\|_0$.
-
- \item \textbf{ReLU networks:} Let $h_{a, W}(x) = \langle a,
- \sigma(Wx)\rangle$, then $\nabla_{a}h_{a, W}(x) = \sigma(Wx)$ and
- $\nabla_{w_j}h_{a, W}(x) = a_j x \mathbf{1}_{\langle w_j, x\rangle >
- 0}$. The implicit bias enables the learning of sparse data-active
- features. Activated neurons align to fit reducing the rank of
- $\phi_\theta(X)$.
-\end{itemize}
-The main insight is that the Jacobian can be significantly simplified during
-the loss stabilization phase. While the gradient part tries to fit the data
-and align neurons the noise part intends to minimize the $l_2$-norm of the
-columns of $\phi(X)$. Thus the combination suggests to count the average
-number of \textbf{distinct} (counting the group of aligned neurons as one),
-\textbf{non-zero} activations over the training set. This will be referred to
-as the \textbf{feature sparsity coefficient}. In the next section the authors
-show that the conjectured sparsity is observed empirically for a variety of
-models. Where both the feature sparsity coefficient and the rank of
-$\phi_\theta (X)$ can be used as a good proxy to track the hidden progress
-during the loss stabilization phase.
+In summary, the Jacobian can be significantly simplified during the loss
+stabilization phase: the noise part of the SDE dynamics seeks to minimize
+the $l_2$-norm of its columns, while the fitting part tries to fit the data
+and thereby prevents the Jacobian from collapsing entirely to zero. This
+suggests collapsing the Jacobian entries below a certain threshold to zero
+and counting the average number of \textbf{distinct} and \textbf{non-zero}
+activations over the training set, which is defined as the \textbf{feature
+sparsity coefficient}.
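+A possible way to compute such a coefficient from an activation matrix is
+sketched below (an illustrative guess at the bookkeeping, not the authors'
+exact definition; in particular, ``distinct'' is approximated here by
+rounding activation values, and the threshold is arbitrary):
+\begin{verbatim}
+import numpy as np
+
+def feature_sparsity(acts, threshold=1e-3, decimals=3):
+    # acts: (n_samples, n_neurons) matrix of post-activation values.
+    # Collapse near-zero entries, then for each sample count the number of
+    # distinct non-zero (rounded) activation values and average over samples.
+    a = np.where(np.abs(acts) < threshold, 0.0, acts)
+    counts = [len(set(np.round(row[row != 0.0], decimals))) for row in a]
+    return float(np.mean(counts))
+
+# toy usage on a random one-hidden-layer ReLU feature map
+rng = np.random.default_rng(0)
+X, W = rng.standard_normal((100, 10)), rng.standard_normal((10, 64))
+acts = np.maximum(X @ W, 0.0)
+print(feature_sparsity(acts))
+\end{verbatim}
+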
\section{Empirical Evidence}
In the following section the results for diagonal nets, deep nets, deep dense
@@ -260,7 +333,20 @@ $\beta_* \in \mathbb{R}^{d}$ is $r=20$ sparse. Four different SGD runs are
considered: one with a small step size and three with an initial large step size
decayed after $10\%, 30\%$ and $50\%$ of iterations, respectively.
-TODO: results and description of results
+\begin{figure}[H]
+ \centering
+ \includegraphics[width=0.8\textwidth]{./pics/dn_loss.png}
+ \includegraphics[width=0.8\textwidth]{./pics/dn_sparsity.png}
+ \includegraphics[width=\textwidth]{./pics/dn_setup.png}
+ \caption{Diagonal Linear Nets Results \label{fig: diagonal}}
+\end{figure}
+
+Figure \ref{fig: diagonal} shows the results described in the previous
+sections. In all three cases with a large step size the loss stabilizes and
+the feature sparsity progress can be seen in both the rank of the Jacobian
+and the feature sparsity coefficient.
+
+
\subsection{Simple ReLU Networks}
\textbf{Two Layer ReLU networks in 1D.}
@@ -271,9 +357,17 @@ stabilization is observed at around $10^{-5}$, the predictor becomes simpler
and is expected to generalize better and both the rank and the feature
sparsity coefficient decrease during loss stabilization. Because of the
low dimensionality of the example it can be directly observed that the final
-predictor is sparse in terms of the number of distinct ReLU kinks.
+predictor is sparse in terms of the number of distinct ReLU kinks (Figure
+\ref{fig: relu2}).
+
+\begin{figure}[H]
+ \centering
+ \includegraphics[width=0.8\textwidth]{./pics/relu2_loss.png}
+ \includegraphics[width=0.8\textwidth]{./pics/relu2_sparsity.png}
+ \includegraphics[width=\textwidth]{./pics/relu2_setup.png}
+ \caption{\textbf{Two Layer} ReLU: 1D regression task \label{fig: relu2}}
+\end{figure}
-TODO: results and description of results
\textbf{Three Layer ReLU networks in 1D.}
For deeper ReLU networks a teacher-student setup with a random three layer
@@ -285,7 +379,13 @@ sample size. The model is trained using SGD with a medium constant step size
and in contrast with a large step size with warmup decayed after $10\%, 30\%$
and $50\%$ of iterations, respectively.
-TODO: results and description of results
+\begin{figure}[H]
+ \centering
+ \includegraphics[width=0.8\textwidth]{./pics/relu3_loss.png}
+ \includegraphics[width=\textwidth]{./pics/relu3_sparsity.png}
+ \includegraphics[width=\textwidth]{./pics/relu3_setup.png}
+    \caption{\textbf{Three Layer} ReLU: teacher-student setup \label{fig: relu3}}
+\end{figure}
\subsection{Deep ReLU Networks}
In this section a state of the art example is considered to show the sparse
@@ -298,20 +398,23 @@ because of too large matrix dimensions, hence only the feature sparsity
coefficient is taken at two layers: end of super block 3 (middle of network)
and super block 4 (before average pooling at the end of the network) of
DenseNets. Two cases are tested: one basic setting and a state-of-the-art
-setting with momentum and standard augmentation.
-
-TODO: Results and observations
-
-
-\section{Comparison of the Results}
-
-\section{Conclusion}
+setting with momentum and standard augmentation. In all cases the same
+observations are made as for the much simpler toy models, i.e. shallow ReLU
+networks and diagonal linear nets (Figure \ref{fig: deep}).
+\begin{figure}[H]
+ \centering
+ \includegraphics[width=0.8\textwidth]{./pics/densen_tiny_loss.png}
+ \includegraphics[width=0.8\textwidth]{./pics/densen_tiny_sparsity.png}
+ \includegraphics[width=\textwidth]{./pics/densen_tiny_setup.png}
+ \caption{\textbf{DenseNet-100} trained on \textbf{ImageNet}
+ (Image classification Task) \label{fig: deep}}
+\end{figure}
\nocite{andriushchenko2023sgd}
\nocite{shalev2014understanding}
-\nocite{fast_armijo_2022}
+\nocite{pillaudvivien2022label}
\printbibliography
\end{document}
diff --git a/opt_sem/summary/pics/densen_tiny_loss.png b/opt_sem/summary/pics/densen_tiny_loss.png
Binary files differ.
diff --git a/opt_sem/summary/pics/densen_tiny_setup.png b/opt_sem/summary/pics/densen_tiny_setup.png
Binary files differ.
diff --git a/opt_sem/summary/pics/densen_tiny_sparsity.png b/opt_sem/summary/pics/densen_tiny_sparsity.png
Binary files differ.
diff --git a/opt_sem/summary/pics/dn_loss.png b/opt_sem/summary/pics/dn_loss.png
Binary files differ.
diff --git a/opt_sem/summary/pics/dn_setup.png b/opt_sem/summary/pics/dn_setup.png
Binary files differ.
diff --git a/opt_sem/summary/pics/dn_sparsity.png b/opt_sem/summary/pics/dn_sparsity.png
Binary files differ.
diff --git a/opt_sem/summary/pics/relu2_loss.png b/opt_sem/summary/pics/relu2_loss.png
Binary files differ.
diff --git a/opt_sem/summary/pics/relu2_setup.png b/opt_sem/summary/pics/relu2_setup.png
Binary files differ.
diff --git a/opt_sem/summary/pics/relu2_sparsity.png b/opt_sem/summary/pics/relu2_sparsity.png
Binary files differ.
diff --git a/opt_sem/summary/pics/relu3_loss.png b/opt_sem/summary/pics/relu3_loss.png
Binary files differ.
diff --git a/opt_sem/summary/pics/relu3_setup.png b/opt_sem/summary/pics/relu3_setup.png
Binary files differ.
diff --git a/opt_sem/summary/pics/relu3_sparsity.png b/opt_sem/summary/pics/relu3_sparsity.png
Binary files differ.