commit 688d349790b812e539d6dc38ebfedfd454d338b2
parent 34c75d177207a810afbd327b3c041c38545c3979
Author: miksa234 <milutin@popovic.xyz>
Date: Wed, 17 Jan 2024 10:39:01 +0000
presentation fixed typos and added steps
Diffstat:
| M | opt_sem/pres/pres.tex | | | 112 | ++++++++++++++++++++++++++++++++++++++++---------------------------------------- |
1 file changed, 56 insertions(+), 56 deletions(-)
diff --git a/opt_sem/pres/pres.tex b/opt_sem/pres/pres.tex
@@ -53,37 +53,39 @@
\end{frame}
\begin{frame}{Introduction SGD}
- Given:
+ \begin{itemize}[<+->]
+ \item[] Given:
+ \begin{itemize}
+ \item input/output samples $(x_i, y_i)_{i=1}^{n} \in
+ \mathbb{R}^{d} \times \mathbb{R}$
+ \item Prediction functions
+ $\mathcal{H} = \{x \mapsto h_{\theta}(x)\; | \; \theta \in
+ \mathbb{R}^{p}\}$
+ \item setting $p \gg n$
+ \end{itemize}
+ \item[] Minimize the loss function
+ \begin{center}
+ \begin{minipage}{0.5\textwidth}
+ \begin{align*}
+ \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left(
+ h_\theta(x_i) - y_i \right)^{2},
+ \end{align*}
+ \end{minipage}
+ \end{center}
\begin{itemize}
- \item input/output samples $(x_i, y_i)_{i=1}^{n} \in
- \mathbb{R}^{d} \times \mathbb{R}$
- \item Prediction functions
- $\mathcal{H} = \{x \mapsto h_{\theta}(x)\; | \; \theta \in
- \mathbb{R}^{p}\}$
- \item setting $p \gg n$
+ \item using Stochastic Gradient Descent (SGD).
\end{itemize}
- Minimize the loss loss
- \begin{center}
- \begin{minipage}{0.5\textwidth}
- \begin{align*}
- \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left(
- h_\theta(x_i) - y_i \right)^{2},
- \end{align*}
- \end{minipage}
- \end{center}
- \begin{itemize}
- \item using Stochastic Gradient Descend (SGD).
+
\end{itemize}
\end{frame}
\begin{frame}{Introduction SGD}
- \begin{itemize}
- \item Insted of computing the gradient of each fucntion in the
+ \begin{itemize}[<+->]
+ \item Instead of computing the gradient of each function in the
sum \item consider $i \sim U(\{1,\ldots,n\})$ uniformly distributed
\item SGD recursion for learning rate $\eta > 0$, initial $\theta_0
\in \mathbb{R}^{p}$, $\forall t \in \mathbb{N}$
- \end{itemize}
\begin{center}
\begin{minipage}{0.5\textwidth}
\begin{align*}
@@ -93,12 +95,13 @@
\end{align*}
\end{minipage}
\end{center}
+ \end{itemize}
\end{frame}
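A minimal Python sketch of the SGD recursion above, with a linear predictor h_theta(x) = <theta, x> standing in for the slide's generic h_theta; the data and constants below are illustrative, not taken from the slides.

    import numpy as np

    rng = np.random.default_rng(0)

    # Illustrative data: n input/output samples in R^d x R.
    n, d = 20, 5
    X = rng.normal(size=(n, d))
    y = rng.normal(size=n)

    def loss(theta):
        # L(theta) = 1/(2n) * sum_i (h_theta(x_i) - y_i)^2 with h_theta(x) = <theta, x>.
        return np.mean((X @ theta - y) ** 2) / 2

    def grad_f_i(theta, i):
        # Gradient of the per-sample loss f_i(theta) = 1/2 * (h_theta(x_i) - y_i)^2;
        # its expectation over i ~ U({1,...,n}) is the full gradient of L.
        return (X[i] @ theta - y[i]) * X[i]

    eta = 0.05           # learning rate eta > 0
    theta = np.zeros(d)  # initial theta_0 (here p = d)

    for t in range(1000):
        i = rng.integers(n)                        # i ~ U({1, ..., n})
        theta = theta - eta * grad_f_i(theta, i)   # SGD recursion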
\begin{frame}{GD with specific Label Noise}
- \begin{itemize}
+ \begin{itemize}[<+->]
\item Define a random vector $\xi_t \in \mathbb{R}^{n}$ in each iteration
$t \ge 0$
\begin{center}
@@ -149,7 +152,7 @@
\end{frame}
\begin{frame}{Quadratic Loss Stabilization}
- \begin{itemize}
+ \begin{itemize}[<+->]
\item Non-convex toy model:
\item One-dimensional data $x_i$ from distribution $\hat{\rho}$
\item linear model outputs $y_i = x_i \theta_*^{2}$
@@ -171,7 +174,7 @@
\end{frame}
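A sketch of this toy problem, assuming the predictor h_theta(x) = theta^2 * x (implied by the rescaled map on the later slides, not stated here) and taking rho_hat uniform on [0.95, 1.05] as an illustrative choice; constant factors in the gradient are absorbed into eta so that the effective per-sample step matches the slides' gamma = eta * theta_*^2 * x_i^2.

    import numpy as np

    rng = np.random.default_rng(1)

    # One-dimensional inputs x_i ~ rho_hat (illustrative choice: uniform on [0.95, 1.05]).
    n = 100
    x = rng.uniform(0.95, 1.05, size=n)
    theta_star = 1.0
    y = x * theta_star**2                    # labels y_i = x_i * theta_*^2

    def loss(theta):
        # Quadratic loss 1/(2n) * sum_i (theta^2 * x_i - y_i)^2.
        return np.mean((theta**2 * x - y) ** 2) / 2

    eta = 1.12      # large step size, inside the stabilization window of the next slides
    theta = 0.3
    losses = []
    for t in range(500):
        i = rng.integers(n)
        # One stochastic step on sample i; the effective step is gamma_i = eta * theta_*^2 * x_i^2.
        theta = theta - eta * x[i] ** 2 * theta * (theta**2 - theta_star**2)
        losses.append(loss(theta))
    # With this eta the loss stops decreasing and keeps oscillating around a nonzero
    # level ("loss stabilization"); with a small eta it would converge to 0.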
\begin{frame}{Quadratic Loss Stabilization}
- \begin{block}{\textbf{Porposition:} Loss Stabilizaion}
+ \begin{block}{\textbf{Proposition:} Loss Stabilization}
Assume $\exists x_{\text{min}}, x_{\text{max}} > 0$ s.t.
$\text{supp}(\hat{\rho}) \subset [x_\text{min}, x_\text{max}]$.
Then for any $\eta \in \left( (\theta_* x_\text{min})^{-2}, 1.25
@@ -203,7 +206,7 @@
\end{frame}
\begin{frame}{Quadratic Loss Stabilization}
- \begin{itemize}
+ \begin{itemize}[<+->]
\item For clarity rescale $\theta_t \to \theta_t/\theta_*$
\begin{center}
\begin{minipage}{0.5\textwidth}
@@ -216,14 +219,14 @@
where $\gamma \sim \hat{\rho}_{\gamma}$ is the pushforward of
$\hat{\rho}$ under $z \mapsto \eta\theta_*^{2}z^2$.
- \item then the iterval of $\eta$ becomes that of
+ \item then the condition on $\eta$ becomes
$\text{supp}(\hat{\rho}_{\gamma}) \subseteq (1, 1.25)$.
\end{itemize}
\end{frame}
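A quick numerical look at the pushforward rho_hat_gamma, with rho_hat again taken uniform on [x_min, x_max] for illustration and eta chosen so that the support lands inside (1, 1.25).

    import numpy as np

    rng = np.random.default_rng(2)

    x_min, x_max, theta_star = 0.95, 1.05, 1.0
    eta = 1.12                                    # illustrative step size

    z = rng.uniform(x_min, x_max, size=100_000)   # z ~ rho_hat
    gamma = eta * theta_star**2 * z**2            # gamma ~ rho_hat_gamma, pushforward under z -> eta*theta_*^2*z^2
    print(gamma.min(), gamma.max())               # approx. 1.01 and 1.23: supp(rho_hat_gamma) inside (1, 1.25)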
\begin{frame}{Quadratic Loss Stabilization}
- \begin{itemize}
- \item To proove the proposition devide $(0, 1.162)$ into 4
+ \begin{itemize}[<+->]
+ \item To prove the proposition divide $(0, 1.162)$ into 4
regions
\item $I_0=(0, 0.65]$, $I_1=(0.65, 1-\varepsilon)$,
$I_2=[1-\varepsilon, 1)$ and $I_3 = [1, 1.162)$.
@@ -231,7 +234,7 @@
\item All iterates end up in $I_1$; if they leave, they come back
to $I_1$ within 2 steps.
- \item devided into 4 steps
+ \item divided into 4 steps
\begin{enumerate}
\item There is a $t\ge 0$: $\theta_t \in I_1\cup
I_2\cup I_3$
@@ -251,8 +254,8 @@
\end{frame}
\begin{frame}{Quadratic Loss Stabilization}
- \begin{itemize}
- \item Proof is devided into 4 steps show
+ \begin{itemize}[<+->]
+ \item The proof is divided into 4 steps, showing:
\begin{enumerate}
\item There is a $t\ge 0$: $\theta_t \in I_1\cup
I_2\cup I_3$
@@ -270,17 +273,15 @@
\end{frame}
\begin{frame}{Quadratic Loss Stabilization}
- \begin{itemize}
+ \begin{itemize}[<+->]
\item For \textbf{1}: (There is a $t\ge 0: \theta_t \in I_1 \cup
I_2 \cup I_3)$
\item Show that $h_\gamma(\theta) = \theta -
\gamma\theta(\theta^{2}-1)$ maps $(0, 1.162)$ into itself
for every $\gamma \in (1, 1.25)$.
- \end{itemize}
\vspace{0.5cm}
- \begin{itemize}
\item For \textbf{2}: (If $\theta_t \in I_3$, then $\theta_{t+1} \in I_1
\cup I_2$.)
\item For $\theta \in (1, 1.162)$ note $h_\gamma(\theta)$ is linear in $\gamma$ when
@@ -290,11 +291,9 @@
\item So $\theta_{t+1} \in I_1 \cup I_2$.
- \end{itemize}
\vspace{0.5cm}
- \begin{itemize}
\item \textbf{3} \& \textbf{4} are a bit more complicated but can
also be shown with basic analysis.
\end{itemize}
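A numerical sanity check (not a proof) of steps 1 and 2: iterate the map h_gamma from the slide above, resampling gamma uniformly from (1, 1.25) at every step as a stand-in for gamma ~ rho_hat_gamma.

    import numpy as np

    rng = np.random.default_rng(3)

    def h(theta, gamma):
        # One-step map h_gamma(theta) = theta - gamma * theta * (theta^2 - 1).
        return theta - gamma * theta * (theta**2 - 1)

    theta = 0.05                           # any starting point in (0, 1.162)
    traj = [theta]
    for t in range(10_000):
        gamma = rng.uniform(1.0, 1.25)     # stand-in for gamma ~ rho_hat_gamma
        theta = h(theta, gamma)
        traj.append(theta)

    traj = np.array(traj)
    assert traj.min() > 0 and traj.max() < 1.162   # step 1: iterates never leave (0, 1.162)
    # After a short transient the iterates keep bouncing around theta = 1 between
    # I_1, I_2 and I_3 instead of converging, which is the stabilization behaviour.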
@@ -302,11 +301,11 @@
\begin{frame}{SGD Dynamics}
- \begin{itemize}
+ \begin{itemize}[<+->]
\item To further understand loss stabilization, assume perfect
stabilization
- \item Conjecure: \textit{During loss stabilizaion, SGD is well
- modelled by GD with constant label noise}
+ \item Conjecture: \textit{During loss stabilization, SGD is well
+ modeled by GD with constant label noise}
\item Label noise dynamics can be studied with Stochastic
Differential Equations (SDEs)
@@ -315,7 +314,7 @@
\end{frame}
\begin{frame}{SGD Dynamics Heuristics}
- \begin{itemize}
+ \begin{itemize}[<+->]
\item Heuristics: rewrite the SGD iteration, where $V_t(\theta_t, i_t) =
\sqrt{\eta}\left(\nabla_\theta f(\theta_t) - \nabla
f_{i_t}(\theta_t) \right)$ is a $p$-dimensional r.v.
@@ -343,7 +342,7 @@
\end{frame}
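A small numerical check of this decomposition, using a linear least-squares model as a placeholder for f and the f_i (the specific model is an assumption of this sketch).

    import numpy as np

    rng = np.random.default_rng(4)

    n, d = 200, 5
    X = rng.normal(size=(n, d))
    y = rng.normal(size=n)
    eta = 0.1
    theta = rng.normal(size=d)

    def grad_full(theta):
        # Gradient of f(theta) = 1/(2n) * sum_i (x_i^T theta - y_i)^2.
        return X.T @ (X @ theta - y) / n

    def grad_i(theta, i):
        # Gradient of the per-sample loss f_i(theta) = 1/2 * (x_i^T theta - y_i)^2.
        return (X[i] @ theta - y[i]) * X[i]

    # V_t(theta, i) = sqrt(eta) * (grad f(theta) - grad f_i(theta)); one SGD step is then
    # theta_{t+1} = theta_t - eta * grad f(theta_t) + sqrt(eta) * V_t(theta_t, i_t).
    V = np.array([np.sqrt(eta) * (grad_full(theta) - grad_i(theta, i)) for i in range(n)])

    print(V.mean(axis=0))   # ~ 0: the injected noise is mean-zero over i ~ U({1,...,n})
    print(np.cov(V.T))      # its covariance, eta * Cov_i(grad f_i); heuristically what the
                            # SDE's diffusion term has to model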
\begin{frame}{SGD Dynamics Heuristics}
- \begin{itemize}
+ \begin{itemize}[<+->]
\item Then consider a time-homogeneous It\^o-type SDE for $\tau\ge 0$
\begin{center}
\begin{minipage}{0.5\textwidth}
@@ -353,7 +352,8 @@
\end{align*}
\end{minipage}
\end{center}
- where $\theta_\tau \in \mathbb{R}^{n}$, $B_\tau$ standard p-dim. Braunian
+ where $\theta_\tau \in \mathbb{R}^{p}$, $B_\tau$ standard $p$-dim.
+ Brownian
motion, $b:\mathbb{R}^{p} \to \mathbb{R}^{p}$ the drift and
$\sigma: \mathbb{R}^{p}\to \mathbb{R}^{p\times p}$ the diffusion
matrix
@@ -384,12 +384,12 @@
\end{frame}
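A generic Euler-Maruyama sketch of how an SDE of this form is simulated; the drift b and diffusion sigma below are simple placeholders, not the slides' specific choices.

    import numpy as np

    rng = np.random.default_rng(5)

    p = 3            # parameter dimension
    dt = 1e-3        # discretization step in tau
    T = 1.0

    def b(theta):
        # Placeholder drift b: R^p -> R^p (here a simple linear pull towards 0).
        return -theta

    def sigma(theta):
        # Placeholder diffusion matrix sigma: R^p -> R^{p x p}.
        return 0.1 * np.eye(p)

    theta = np.ones(p)
    for _ in range(int(T / dt)):
        dB = np.sqrt(dt) * rng.normal(size=p)               # increment of p-dim. Brownian motion
        theta = theta + b(theta) * dt + sigma(theta) @ dB   # Euler-Maruyama step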
\begin{frame}{SGD Dynamics Heuristics}
- \begin{itemize}
- \item In our case: we have a loss funtino $\mathcal{L}$
+ \begin{itemize}[<+->]
+ \item In our case: we have a loss function $\mathcal{L}$
\item The noise at state $\theta$ is spanned by
$\{\nabla_\theta h_\theta(x_1), \ldots, \nabla_\theta
h_\theta(x_n)\}$
- \item Loss stabilization occrus at a level set $\delta > 0$
+ \item Loss stabilization occurs at a level set $\delta > 0$
\begin{center}
\begin{minipage}{0.5\textwidth}
\begin{align*}
@@ -402,13 +402,13 @@
\vspace{0.5cm}
where $\phi_\theta(X) := [\nabla_\theta
h_\theta(x_i)^{T}]_{i=1}^{n} \in \mathbb{R}^{n\times p}$ is
- reffered to as the Jacobian.
+ referred to as the Jacobian.
\end{itemize}
\end{frame}
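A sketch of the Jacobian phi_theta(X) defined above, approximated by central finite differences; the two-parameter predictor is a placeholder, and any differentiable h_theta works the same way.

    import numpy as np

    def h(theta, x):
        # Placeholder predictor h_theta(x) with p = 2 parameters.
        return theta[0] * np.tanh(theta[1] * x)

    def jacobian(theta, X, eps=1e-6):
        # phi_theta(X) = [grad_theta h_theta(x_i)^T]_{i=1..n} in R^{n x p}.
        n, p = len(X), len(theta)
        J = np.zeros((n, p))
        for j in range(p):
            e = np.zeros(p)
            e[j] = eps
            J[:, j] = (h(theta + e, X) - h(theta - e, X)) / (2 * eps)
        return J

    X = np.linspace(-1.0, 1.0, 5)      # n = 5 one-dimensional inputs
    theta = np.array([0.7, 1.3])
    phi = jacobian(theta, X)           # its rows are the directions that span the noise at theta
    print(phi.shape)                   # (5, 2)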
\begin{frame}{Diagonal Linear Networks}
- \begin{itemize}
- \item Two layer linear network with onyl diagonal connections:
+ \begin{itemize}[<+->]
+ \item Two-layer linear network with only diagonal connections:
\item Prediction function:
$h_{u, v}(x) = \langle u, v\odot x\rangle = \langle u \odot
v, x\rangle$.
@@ -428,22 +428,22 @@
\end{frame}
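A minimal sketch of the diagonal-linear-network prediction function and its two equivalent forms.

    import numpy as np

    rng = np.random.default_rng(6)

    d = 6
    u = rng.normal(size=d)
    v = rng.normal(size=d)
    x = rng.normal(size=d)

    # Each input coordinate has its own two-weight path; the effective linear
    # predictor is beta = u * v (element-wise).
    pred1 = np.dot(u, v * x)        # <u, v (*) x>
    pred2 = np.dot(u * v, x)        # <u (*) v, x>
    assert np.isclose(pred1, pred2)

    # Gradients of h_{u,v}(x) with respect to u and v (the entries of the
    # Jacobian from the earlier slides, for this model):
    grad_u = v * x
    grad_v = u * x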
\begin{frame}{Diagonal Linear Networks: Conclusion}
- \begin{itemize}
+ \begin{itemize}[<+->]
\item During the first phase, SGD with large $\eta$
\begin{enumerate}
- \item decreases the traingin loss
+ \item decreases the training loss
\item stabilizes at some level set $\delta$
\end{enumerate}
- \item Meanwhile tehere is an effective noise-driven dynamics
- which oscilates on $\mathcal{O}(\sqrt{\eta\delta})$
+ \item Meanwhile there is an effective noise-driven dynamics
+ which oscillates at scale $\mathcal{O}(\sqrt{\eta\delta})$
\item Decreasing the step size later leads to recovery of the
sparsest predictor.
\end{itemize}
\end{frame}
\begin{frame}{Sparsity Coefficient}
- \begin{itemize}
- \item SDE dynamics of diagonal networks emphesizes that there is
+ \begin{itemize}[<+->]
+ \item The SDE dynamics of diagonal networks emphasize that there is
a sparsity bias because of $v \odot [X^{T}dB_\tau]$ leading
to a coordinate shrinkage effect
\item Generally the same thing happens w.r.t. the Jacobian
@@ -466,8 +466,8 @@
\end{frame}
\begin{frame}{DLN Feature Sparsity Coefficient}
- \begin{itemize}
- \item The jacobian can be simplified during loss stabilization.
+ \begin{itemize}[<+->]
+ \item The Jacobian can be simplified during loss stabilization.
\item Motivation: count the average number of distinct, nonzero
activations over the training set = \textbf{feature sparsity
coefficient}