notes

uni notes
git clone git://popovic.xyz/notes.git

commit 688d349790b812e539d6dc38ebfedfd454d338b2
parent 34c75d177207a810afbd327b3c041c38545c3979
Author: miksa234 <milutin@popovic.xyz>
Date:   Wed, 17 Jan 2024 10:39:01 +0000

presentation fixed typos and added steps

Diffstat:
M opt_sem/pres/pres.tex | 112 ++++++++++++++++++++++++++++++++++++----------------------------------------
1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/opt_sem/pres/pres.tex b/opt_sem/pres/pres.tex
@@ -53,37 +53,39 @@
 \end{frame}

 \begin{frame}{Introduction SGD}
-    Given:
+    \begin{itemize}[<+->]
+        \item[] Given:
+        \begin{itemize}
+            \item input/output samples $(x_i, y_i)_{i=1}^{n} \in
+                \mathbb{R}^{d} \times \mathbb{R}$
+            \item Prediction functions
+                $\mathcal{H} = \{x \mapsto h_{\theta}(x)\; | \; \theta \in
+                \mathbb{R}^{p}\}$
+            \item setting $p \gg n$
+        \end{itemize}
+        \item[] Minimize the loss function
+        \begin{center}
+            \begin{minipage}{0.5\textwidth}
+                \begin{align*}
+                    \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left(
+                    h_\theta(x_i) - y_i \right)^{2},
+                \end{align*}
+            \end{minipage}
+        \end{center}
     \begin{itemize}
-        \item input/output samples $(x_i, y_i)_{i=1}^{n} \in
-            \mathbb{R}^{d} \times \mathbb{R}$
-        \item Prediction functions
-            $\mathcal{H} = \{x \mapsto h_{\theta}(x)\; | \; \theta \in
-            \mathbb{R}^{p}\}$
-        \item setting $p \gg n$
+        \item using Stochastic Gradient Descend (SGD).
     \end{itemize}
-    Minimize the loss loss
-    \begin{center}
-        \begin{minipage}{0.5\textwidth}
-            \begin{align*}
-                \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left(
-                h_\theta(x_i) - y_i \right)^{2},
-            \end{align*}
-        \end{minipage}
-    \end{center}
-    \begin{itemize}
-        \item using Stochastic Gradient Descend (SGD).
+    \end{itemize}
 \end{frame}

 \begin{frame}{Introduction SGD}
-    \begin{itemize}
-        \item Insted of computing the gradient of each fucntion in the
+    \begin{itemize}[<+->]
+        \item Instead of computing the gradient of each function in the
            sum
         \item consider $i \sim U(\{1,\ldots,n\} )$ uniformly distributed
            with
         \item SGD recursion for learning rate $\eta > 0$, initial
            $\theta_0 \in \mathbb{R}^{p}$, $\forall t \in \mathbb{N}$
-    \end{itemize}
     \begin{center}
         \begin{minipage}{0.5\textwidth}
             \begin{align*}
@@ -93,12 +95,13 @@
             \end{align*}
         \end{minipage}
     \end{center}
+    \end{itemize}
 \end{frame}

 \begin{frame}{GD with specific Label Noise}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item Define a random vector $\xi_t \in \mathbb{R}^{n}$ in each
            iteration $t \ge 0$
         \begin{center}
@@ -149,7 +152,7 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item Non-convex toy model:
         \item One-dimensional data $x_i$ from distribution $\hat{\rho}$
         \item linear model outputs $y_i = x_i \theta_*^{2}$
@@ -171,7 +174,7 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{block}{\textbf{Porposition:} Loss Stabilizaion}
+    \begin{block}{\textbf{Proposition:} Loss Stabilization}
         Assume $\exists x_{\text{min}}, x_{\text{max}} > 0$ s.t.
         $\text{supp}(\hat{\rho}) \subset [x_\text{min}, x_\text{max}]$.
         Then for any $\eta \in \left( (\theta_* x_\text{min})^{-2}, 1.25
@@ -203,7 +206,7 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item For clarity rescale $\theta_t \to \theta_t/\theta_*$
         \begin{center}
             \begin{minipage}{0.5\textwidth}
@@ -216,14 +219,14 @@
            where $\gamma \sim \hat{\rho}_{\gamma}$ is the pushforward
            $\hat{\rho}$ under $z \mapsto \eta\theta_*^{2}z^2$.
-        \item then the iterval of $\eta$ becomes that of
+        \item then the interval of $\eta$ becomes that of
            $\text{supp}(\hat{\rho}_{\gamma}) \subseteq (1, 1.25)$.
     \end{itemize}
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
-        \item To proove the proposition devide $(0, 1.162)$ into 4
+    \begin{itemize}[<+->]
+        \item To prove the proposition divide $(0, 1.162)$ into 4
            regions
         \item $I_0=(0, 0.65]$, $I_1=(0.65, 1-\varepsilon)$,
            $I_2=[1-\varepsilon, 1)$ and $I_3 = [1, 1.162)$.
@@ -231,7 +234,7 @@
         \item All iterates end up in $I_1$, leave and come back to
            $I_1$ in 2 steps.
-        \item devided into 4 steps
+        \item divided into 4 steps
         \begin{enumerate}
             \item There is a $t\ge 0$: $\theta_t \in
                I_1\cup I_2\cup I_3$
@@ -251,8 +254,8 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
-        \item Proof is devided into 4 steps show
+    \begin{itemize}[<+->]
+        \item Proof is divided into 4 steps show
         \begin{enumerate}
             \item There is a $t\ge 0$: $\theta_t \in
                I_1\cup I_2\cup I_3$
@@ -270,17 +273,15 @@
 \end{frame}

 \begin{frame}{Quadratic Loss Stabilization}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item For \textbf{1}: (There is a $t\ge 0: \theta_t \in
            I_1 \cup I_2 \cup I_3)$
         \item Show that the function $h_\gamma(\theta) = \theta -
            \gamma\theta(1-\theta^{2})$ for $\gamma \in (1, 1.25)$ stays in
            $(0, 1.162)$.
-    \end{itemize}
     \vspace{0.5cm}
-    \begin{itemize}
         \item For \textbf{2}: (For $\theta_t \in I_3$, then $\theta_{t+1}
            \in I_1 \cup I_2$.)
         \item For $\theta \in (1, 1.162)$ note $h_\gamma(\theta)$ is linear in $\gamma$ when
@@ -290,11 +291,9 @@
         \item So $\theta_{t+1} \in I_1 \cup I_2$.
-    \end{itemize}
     \vspace{0.5cm}
-    \begin{itemize}
         \item \textbf{3} \& \textbf{4} are a bit more complicated but
            can be shown also with basic analysis.
     \end{itemize}
@@ -302,11 +301,11 @@
 \begin{frame}{SGD Dynamics}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item To further understand loss stabilization -> assume
            perfect stabilization
-        \item Conjecure: \textit{During loss stabilizaion, SGD is well
-            modelled by GD with constant label noise}
+        \item Conjecture: \textit{During loss stabilization, SGD is well
+            modeled by GD with constant label noise}
         \item Label noise dynamics can be studies with Stochastic
            Differential Equations (SDEs)
@@ -315,7 +314,7 @@
 \end{frame}

 \begin{frame}{SGD Dynamics Heuristics}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item Heuristics: rewrite SGD iteration, where $V_t(\theta_t, i_t)
            = \sqrt{\eta}\left(\nabla_\theta f(\theta_k) - \nabla
            f_{i_t}(\theta_t) \right) $ is $d$-dimensional r.v.
@@ -343,7 +342,7 @@
 \end{frame}

 \begin{frame}{SGD Dynamics Heuristics}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item Then consider time-homogeneous It\^o type SDE for $\tau\ge 0$
         \begin{center}
             \begin{minipage}{0.5\textwidth}
@@ -353,7 +352,8 @@
                 \end{align*}
             \end{minipage}
         \end{center}
-        where $\theta_\tau \in \mathbb{R}^{n}$, $B_\tau$ standard p-dim. Braunian
+        where $\theta_\tau \in \mathbb{R}^{n}$, $B_\tau$ standard p-dim.
+        Brownian motion, $b:\mathbb{R}^{n} \to \mathbb{R}^{n}$ the drift and
         $\sigma: \mathbb{R}^{p}\to \mathbb{R}^{p\times p}$ the diffusion
            matrix
@@ -384,12 +384,12 @@
 \end{frame}

 \begin{frame}{SGD Dynamics Heuristics}
-    \begin{itemize}
-        \item In our case: we have a loss funtino $\mathcal{L}$
+    \begin{itemize}[<+->]
+        \item In our case: we have a loss function $\mathcal{L}$
         \item The noise at state $\theta$ is spanned by
            $\{\nabla_\theta h_\theta(x_1), \ldots, \nabla_\theta h(x_n)\} $
-        \item Loss stabilization occrus at a level set $\delta > 0$
+        \item Loss stabilization occurs at a level set $\delta > 0$
         \begin{center}
             \begin{minipage}{0.5\textwidth}
                 \begin{align*}
@@ -402,13 +402,13 @@
         \vspace{0.5cm}
         where $\phi_\theta(X) := [\nabla_\theta
            h_\theta(x_i)^{T}]_{i=1}^{n} \in \mathbb{R}^{n\times p}$ is
-        reffered to as the Jacobian.
+        referred to as the Jacobian.
     \end{itemize}
 \end{frame}

 \begin{frame}{Diagonal Linear Networks}
-    \begin{itemize}
-        \item Two layer linear network with onyl diagonal connections:
+    \begin{itemize}[<+->]
+        \item Two layer linear network with only diagonal connections:
         \item Prediction function: $h_{u, v}(x) = \langle u, v\odot
            x\rangle = \langle u \odot v, x\rangle$.
@@ -428,22 +428,22 @@
 \end{frame}

 \begin{frame}{Diagonal Linear Networks: Conclusion}
-    \begin{itemize}
+    \begin{itemize}[<+->]
         \item During the first phase SGD with large $\eta$
         \begin{enumerate}
-            \item decreases the traingin loss
+            \item decreases the training loss
             \item stabilizes at some level set $\delta$
         \end{enumerate}
-        \item Meanwhile tehere is an effective noise-driven dynamics
-            which oscilates on $\mathcal{O}(\sqrt{\eta\delta})$
+        \item Meanwhile there is an effective noise-driven dynamics
+            which oscillates on $\mathcal{O}(\sqrt{\eta\delta})$
         \item Decreasing the step size later leads to recovery of the
            sparsest predictor.
     \end{itemize}
 \end{frame}

 \begin{frame}{Sparsity Coefficient}
-    \begin{itemize}
-        \item SDE dynamics of diagonal networks emphesizes that there is
+    \begin{itemize}[<+->]
+        \item SDE dynamics of diagonal networks emphasizes that there is
            a sparsity bias because of $v \odot [X^{T}dB_\tau]$ leading to
            a coordinate shrinkage effect
         \item Generally the same thing happens w.r.t the Jacobian
@@ -466,8 +466,8 @@
 \end{frame}

 \begin{frame}{DLN Feature Sparsity Coefficient}
-    \begin{itemize}
-        \item The jacobian can be simplified during loss stabilization.
+    \begin{itemize}[<+->]
+        \item The Jacobian can be simplified during loss stabilization.
         \item Motivation: count the average number of distinct, nonzero
            activations over the training set = \textbf{feature sparsity coefficient}
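For reference, the one-dimensional quadratic toy model on the slides above is easy to check numerically. The sketch below is not part of the commit; it assumes the quadratic parametrization h_theta(x) = theta^2 * x that is implicit in the labels y_i = x_i * theta_*^2, and all concrete numbers (n, theta_star, the support of rho_hat, the two step sizes) are illustrative choices rather than values taken from the presentation.

import numpy as np

rng = np.random.default_rng(0)

n = 100
theta_star = 1.0
x = rng.uniform(0.95, 1.05, size=n)   # x_i ~ rho_hat, supp(rho_hat) in [x_min, x_max]
y = x * theta_star**2                 # noiseless labels of the toy model

def loss(theta):
    # L(theta) = 1/(2n) * sum_i (h_theta(x_i) - y_i)^2 with h_theta(x) = theta^2 * x
    return np.mean((x * theta**2 - y) ** 2) / 2

def sgd_step(theta, eta):
    i = rng.integers(n)               # i ~ U({1, ..., n})
    grad_i = 2 * x[i] ** 2 * theta * (theta**2 - theta_star**2)
    return theta - eta * grad_i

theta = 0.1                           # small initialization
eta_large, eta_small = 0.56, 0.01     # illustrative step sizes

# Phase 1: large step size -- after a short transient the iterates oscillate
# in a bounded interval and the loss stabilizes at a positive level set.
for _ in range(200):
    theta = sgd_step(theta, eta_large)
trace = []
for _ in range(2000):
    theta = sgd_step(theta, eta_large)
    trace.append(theta)
print(f"large eta: theta oscillates in ({min(trace):.3f}, {max(trace):.3f}), "
      f"mean loss ~ {np.mean([loss(t) for t in trace]):.3f}")

# Phase 2: decayed step size -- SGD now converges and the loss drops towards zero.
for _ in range(2000):
    theta = sgd_step(theta, eta_small)
print(f"small eta: theta = {theta:.4f}, loss = {loss(theta):.2e}")

With the large step size the printed loss settles around a positive level while theta keeps oscillating in a bounded interval; after the step size is decreased the loss falls towards zero, matching the two-phase picture summarized on the "Diagonal Linear Networks: Conclusion" slide.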