notes

uni notes
git clone git://popovic.xyz/notes.git

commit 688d349790b812e539d6dc38ebfedfd454d338b2
parent 34c75d177207a810afbd327b3c041c38545c3979
Author: miksa234 <milutin@popovic.xyz>
Date:   Wed, 17 Jan 2024 10:39:01 +0000

presentation fixed typos and added steps

Diffstat:
M opt_sem/pres/pres.tex | 112 ++++++++++++++++++++++++++++++++++++----------------------------------------
1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/opt_sem/pres/pres.tex b/opt_sem/pres/pres.tex
@@ -53,37 +53,39 @@
 \end{frame}

 \begin{frame}{Introduction SGD}
-    Given:
+    \begin{itemize}[<+->]
+        \item[] Given:
+        \begin{itemize}
+            \item input/output samples $(x_i, y_i)_{i=1}^{n} \in
+                \mathbb{R}^{d} \times \mathbb{R}$
+            \item Prediction functions
+                $\mathcal{H} = \{x \mapsto h_{\theta}(x)\; | \; \theta \in
+                \mathbb{R}^{p}\}$
+            \item setting $p \gg n$
+        \end{itemize}
+        \item[] Minimize the loss function
+        \begin{center}
+            \begin{minipage}{0.5\textwidth}
+                \begin{align*}
+                    \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left(
+                    h_\theta(x_i) - y_i \right)^{2},
+                \end{align*}
+            \end{minipage}
+        \end{center}
     \begin{itemize}
-        \item input/output samples $(x_i, y_i)_{i=1}^{n} \in
-            \mathbb{R}^{d} \times \mathbb{R}$
-        \item Prediction functions
-            $\mathcal{H} = \{x \mapsto h_{\theta}(x)\; | \; \theta \in
-            \mathbb{R}^{p}\}$
-        \item setting $p \gg n$
+        \item using Stochastic Gradient Descend (SGD).
     \end{itemize}
-    Minimize the loss loss
-    \begin{center}
-        \begin{minipage}{0.5\textwidth}
-            \begin{align*}
-                \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left(
-                h_\theta(x_i) - y_i \right)^{2},
-            \end{align*}
-        \end{minipage}
-    \end{center}
-    \begin{itemize}
-        \item using Stochastic Gradient Descend (SGD).
+    \end{itemize}
 \end{frame}

 \begin{frame}{Introduction SGD}
-    \begin{itemize}
-        \item Insted of computing the gradient of each fucntion in the
+    \begin{itemize}[<+->]
+        \item Instead of computing the gradient of each function in the
            sum
         \item consider $i \sim U(\{1,\ldots,n\} )$ uniformly distributed
            with
         \item SGD recursion for learning rate $\eta > 0$, initial
            $\theta_0 \in \mathbb{R}^{p}$, $\forall t \in \mathbb{N}$
-    \end{itemize}
     \begin{center}
         \begin{minipage}{0.5\textwidth}
             \begin{align*}
@@ -93,12 +95,13 @@
             \end{align*}
         \end{minipage}
     \end{center}
+    \end{itemize}
 \end{frame}

 \begin{frame}{GD with specific Label Noise}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item Define a random vector $\xi_t \in \mathbb{R}^{n}$ in each
            iteration $t \ge 0$
         \begin{center}
@@ -149,7 +152,7 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item Non-convex toy model:
         \item One-dimensional data $x_i$ from distribution $\hat{\rho}$
         \item linear model outputs $y_i = x_i \theta_*^{2}$
@@ -171,7 +174,7 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{block}{\textbf{Porposition:} Loss Stabilizaion}
+    \begin{block}{\textbf{Proposition:} Loss Stabilization}
         Assume $\exists x_{\text{min}}, x_{\text{max}} > 0$ s.t.
         $\text{supp}(\hat{\rho}) \subset [x_\text{min}, x_\text{max}]$.
         Then for any $\eta \in \left( (\theta_* x_\text{min})^{-2}, 1.25
@@ -203,7 +206,7 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item For clarity rescale $\theta_t \to \theta_t/\theta_*$
         \begin{center}
             \begin{minipage}{0.5\textwidth}
@@ -216,14 +219,14 @@
            where $\gamma \sim \hat{\rho}_{\gamma}$ is the pushforward
            $\hat{\rho}$ under $z \mapsto \eta\theta_*^{2}z^2$.
-        \item then the iterval of $\eta$ becomes that of
+        \item then the interval of $\eta$ becomes that of
            $\text{supp}(\hat{\rho}_{\gamma}) \subseteq (1, 1.25)$.
     \end{itemize}
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
-        \item To proove the proposition devide $(0, 1.162)$ into 4
+    \begin{itemize}[<+->]
+        \item To prove the proposition divide $(0, 1.162)$ into 4
            regions
         \item $I_0=(0, 0.65]$, $I_1=(0.65, 1-\varepsilon)$,
            $I_2=[1-\varepsilon, 1)$ and $I_3 = [1, 1.162)$.
@@ -231,7 +234,7 @@
         \item All iterates end up in $I_1$, leave and come back to
            $I_1$ in 2 steps.
-        \item devided into 4 steps
+        \item divided into 4 steps
         \begin{enumerate}
             \item There is a $t\ge 0$: $\theta_t \in
                I_1\cup I_2\cup I_3$
@@ -251,8 +254,8 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
-        \item Proof is devided into 4 steps show
+    \begin{itemize}[<+->]
+        \item Proof is divided into 4 steps show
         \begin{enumerate}
             \item There is a $t\ge 0$: $\theta_t \in
                I_1\cup I_2\cup I_3$
@@ -270,17 +273,15 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item For \textbf{1}: (There is a $t\ge 0: \theta_t \in
            I_1 \cup I_2 \cup I_3)$
         \item Show that the function $h_\gamma(\theta) = \theta -
            \gamma\theta(1-\theta^{2})$ for $\gamma \in (1, 1.25)$ stays in
            $(0, 1.162)$.
-    \end{itemize}
     \vspace{0.5cm}
-    \begin{itemize}
         \item For \textbf{2}: (For $\theta_t \in I_3$, then $\theta_{t+1}
            \in I_1 \cup I_2$.)
         \item For $\theta \in (1, 1.162)$ note $h_\gamma(\theta)$ is linear in $\gamma$ when
@@ -290,11 +291,9 @@
         \item So $\theta_{t+1} \in I_1 \cup I_2$.
-    \end{itemize}
     \vspace{0.5cm}
-    \begin{itemize}
         \item \textbf{3} \& \textbf{4} are a bit more complicated but
            can be shown also with basic analysis.
     \end{itemize}
@@ -302,11 +301,11 @@
 \begin{frame}{SGD Dynamics}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item To further understand loss stabilization -> assume
            perfect stabilization
-        \item Conjecure: \textit{During loss stabilizaion, SGD is well
-            modelled by GD with constant label noise}
+        \item Conjecture: \textit{During loss stabilization, SGD is well
+            modeled by GD with constant label noise}
         \item Label noise dynamics can be studies with Stochastic
            Differential Equations (SDEs)
@@ -315,7 +314,7 @@
 \end{frame}

 \begin{frame}{SGD Dynamics Heuristics}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item Heuristics: rewrite SGD iteration, where $V_t(\theta_t, i_t)
            = \sqrt{\eta}\left(\nabla_\theta f(\theta_k) - \nabla
            f_{i_t}(\theta_t) \right) $ is $d$-dimensional r.v.
@@ -343,7 +342,7 @@
 \end{frame}

 \begin{frame}{SGD Dynamics Heuristics}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item Then consider time-homogeneous It\^o type SDE for $\tau\ge 0$
         \begin{center}
             \begin{minipage}{0.5\textwidth}
@@ -353,7 +352,8 @@
                 \end{align*}
             \end{minipage}
         \end{center}
-        where $\theta_\tau \in \mathbb{R}^{n}$, $B_\tau$ standard p-dim. Braunian
+        where $\theta_\tau \in \mathbb{R}^{n}$, $B_\tau$ standard p-dim.
+        Brownian motion, $b:\mathbb{R}^{n} \to \mathbb{R}^{n}$ the drift and
         $\sigma: \mathbb{R}^{p}\to \mathbb{R}^{p\times p}$ the diffusion
            matrix
@@ -384,12 +384,12 @@
 \end{frame}

 \begin{frame}{SGD Dynamics Heuristics}
-    \begin{itemize}
-        \item In our case: we have a loss funtino $\mathcal{L}$
+    \begin{itemize}[<+->]
+        \item In our case: we have a loss function $\mathcal{L}$
         \item The noise at state $\theta$ is spanned by
            $\{\nabla_\theta h_\theta(x_1), \ldots, \nabla_\theta h(x_n)\} $
-        \item Loss stabilization occrus at a level set $\delta > 0$
+        \item Loss stabilization occurs at a level set $\delta > 0$
         \begin{center}
             \begin{minipage}{0.5\textwidth}
                 \begin{align*}
@@ -402,13 +402,13 @@
         \vspace{0.5cm}
         where $\phi_\theta(X) := [\nabla_\theta
            h_\theta(x_i)^{T}]_{i=1}^{n} \in \mathbb{R}^{n\times p}$ is
-        reffered to as the Jacobian.
+        referred to as the Jacobian.
     \end{itemize}
 \end{frame}

 \begin{frame}{Diagonal Linear Networks}
-    \begin{itemize}
-        \item Two layer linear network with onyl diagonal connections:
+    \begin{itemize}[<+->]
+        \item Two layer linear network with only diagonal connections:
         \item Prediction function: $h_{u, v}(x) = \langle u, v\odot
            x\rangle = \langle u \odot v, x\rangle$.
@@ -428,22 +428,22 @@
 \end{frame}

 \begin{frame}{Diagonal Linear Networks: Conclusion}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item During the first phase SGD with large $\eta$
         \begin{enumerate}
-            \item decreases the traingin loss
+            \item decreases the training loss
             \item stabilizes at some level set $\delta$
         \end{enumerate}
-        \item Meanwhile tehere is an effective noise-driven dynamics
-            which oscilates on $\mathcal{O}(\sqrt{\eta\delta})$
+        \item Meanwhile there is an effective noise-driven dynamics
+            which oscillates on $\mathcal{O}(\sqrt{\eta\delta})$
         \item Decreasing the step size later leads to recovery of the
            sparsest predictor.
     \end{itemize}
 \end{frame}

 \begin{frame}{Sparsity Coefficient}
-    \begin{itemize}
-        \item SDE dynamics of diagonal networks emphesizes that there is
+    \begin{itemize}[<+->]
+        \item SDE dynamics of diagonal networks emphasizes that there is
            a sparsity bias because of $v \odot [X^{T}dB_\tau]$ leading to
            a coordinate shrinkage effect
         \item Generally the same thing happens w.r.t the Jacobian
@@ -466,8 +466,8 @@
 \end{frame}

 \begin{frame}{DLN Feature Sparsity Coefficient}
-    \begin{itemize}
-        \item The jacobian can be simplified during loss stabilization.
+    \begin{itemize}[<+->]
+        \item The Jacobian can be simplified during loss stabilization.
         \item Motivation: count the average number of distinct, nonzero
            activations over the training set = \textbf{feature sparsity coefficient}
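For reference, the one-dimensional quadratic toy model on the slides above is easy to check numerically. The sketch below is not part of the commit; it assumes the quadratic parametrization h_theta(x) = theta^2 * x that is implicit in the labels y_i = x_i * theta_*^2, and all concrete numbers (n, theta_star, the support of rho_hat, the two step sizes) are illustrative choices rather than values taken from the presentation.

import numpy as np

rng = np.random.default_rng(0)

n = 100
theta_star = 1.0
x = rng.uniform(0.95, 1.05, size=n)   # x_i ~ rho_hat, supp(rho_hat) in [x_min, x_max]
y = x * theta_star**2                 # noiseless labels of the toy model

def loss(theta):
    # L(theta) = 1/(2n) * sum_i (h_theta(x_i) - y_i)^2 with h_theta(x) = theta^2 * x
    return np.mean((x * theta**2 - y) ** 2) / 2

def sgd_step(theta, eta):
    i = rng.integers(n)               # i ~ U({1, ..., n})
    grad_i = 2 * x[i] ** 2 * theta * (theta**2 - theta_star**2)
    return theta - eta * grad_i

theta = 0.1                           # small initialization
eta_large, eta_small = 0.56, 0.01     # illustrative step sizes

# Phase 1: large step size -- after a short transient the iterates oscillate
# in a bounded interval and the loss stabilizes at a positive level set.
for _ in range(200):
    theta = sgd_step(theta, eta_large)
trace = []
for _ in range(2000):
    theta = sgd_step(theta, eta_large)
    trace.append(theta)
print(f"large eta: theta oscillates in ({min(trace):.3f}, {max(trace):.3f}), "
      f"mean loss ~ {np.mean([loss(t) for t in trace]):.3f}")

# Phase 2: decayed step size -- SGD now converges and the loss drops towards zero.
for _ in range(2000):
    theta = sgd_step(theta, eta_small)
print(f"small eta: theta = {theta:.4f}, loss = {loss(theta):.2e}")

With the large step size the printed loss settles around a positive level while theta keeps oscillating in a bounded interval; after the step size is decreased the loss falls towards zero, matching the two-phase picture summarized on the "Diagonal Linear Networks: Conclusion" slide.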