\documentclass[fleqn]{beamer}
\beamertemplatenavigationsymbolsempty

\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}

\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{mathptmx}
\usepackage{subcaption}
\usepackage{amsthm}
\usepackage{tikz}
%\usepackage[colorlinks=true,naturalnames=true,plainpages=false,pdfpagelabels=true]{hyperref}
\usetikzlibrary{patterns,decorations.pathmorphing,positioning,arrows,chains}

\usepackage[backend=biber, sorting=none]{biblatex}
\addbibresource{cite.bib}

\setbeamertemplate{endpage}{%
    \begin{frame}
        \centering
        \Large \emph{Thank You!}
    \end{frame}
}

\AtEndDocument{\usebeamertemplate{endpage}}

% vertical separator macro
\newcommand{\vsep}{
    \column{0.0\textwidth}
    \begin{tikzpicture}
        \draw[very thick,black!10] (0,0) -- (0,7.3);
    \end{tikzpicture}
}
\setlength{\mathindent}{0pt}

% Beamer theme
\usetheme{UniVienna}
\usefonttheme[onlysmall]{structurebold}
\mode<presentation>
\setbeamercovered{transparent=10}

\title{SGD with Large Step Sizes Learns Sparse Features}
\subtitle{Seminar Optimization}
\author[Popović Milutin]{Popović Milutin\newline\newline Supervisor: Radu Ioan Bot}
\date{23 January 2024}

\begin{document}
\begin{frame}
    \maketitle
\end{frame}

\begin{frame}{Introduction SGD}
    \begin{itemize}[<+->]
        \item[] Given:
            \begin{itemize}
                \item input/output samples $(x_i, y_i)_{i=1}^{n} \in \mathbb{R}^{d} \times \mathbb{R}$,
                \item prediction functions $\mathcal{H} = \{x \mapsto h_{\theta}(x)\; | \; \theta \in \mathbb{R}^{p}\}$,
                \item the overparametrized setting $p \gg n$.
            \end{itemize}
        \item[] Minimize the loss function
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left( h_\theta(x_i) - y_i \right)^{2}
                    \end{align*}
                \end{minipage}
            \end{center}
            \begin{itemize}
                \item using Stochastic Gradient Descent (SGD).
            \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{Introduction SGD}
    \begin{itemize}[<+->]
        \item Instead of computing the gradient of each function in the sum,
        \item sample an index $i_t \sim U(\{1,\ldots,n\})$ uniformly at random in each step.
        \item SGD recursion for learning rate $\eta > 0$, initial $\theta_0 \in \mathbb{R}^{p}$ and all $t \in \mathbb{N}$:
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        \theta_{t+1} = \theta_t - \eta\left(h_{\theta_t}(x_{i_t}) - y_{i_t}\right) \nabla_{\theta} h_{\theta_t}(x_{i_t}).
                    \end{align*}
                \end{minipage}
            \end{center}
    \end{itemize}
\end{frame}


\begin{frame}{GD with specific Label Noise}
    \begin{itemize}[<+->]
        \item Define a random vector $\xi_t \in \mathbb{R}^{n}$ in each iteration $t \ge 0$ by
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        [\xi_t]_i := (h_{\theta_t}(x_i) - y_i)(1-n\mathbf{1}_{i=i_t}).
                    \end{align*}
                \end{minipage}
            \end{center}
            \vspace{0.5cm}
        \item Then $(\theta_t)_{t\ge0}$ follows the full-batch gradient dynamics on $\mathcal{L}$,
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        \theta_{t+1} = \theta_t - \frac{\eta}{n} \sum_{i=1}^{n} \left( h_{\theta_t}(x_i) - y_i^{t} \right) \nabla_{\theta} h_{\theta_t}(x_i),
                    \end{align*}
                \end{minipage}
            \end{center}
            where $y^{t}:= y + \xi_t$.
        \item Note: $\xi_t$ is a mean-zero random vector.
        \item Note: the variance of $\xi_t$ satisfies $\frac{1}{n(n-1)}\mathbb{E}\|\xi_t\|^{2} = 2\mathcal{L}(\theta_t)$ (computation sketched on the next slide).
    \end{itemize}
\end{frame}
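% Added derivation sketch (not part of the original deck): it only uses the
% definition of $\xi_t$ above and the uniform choice of $i_t$; the residual
% shorthand $r_i$ is introduced here for readability.
\begin{frame}{GD with specific Label Noise: Moments of $\xi_t$}
    \begin{itemize}[<+->]
        \item Write $r_i := h_{\theta_t}(x_i) - y_i$, so that $[\xi_t]_i = r_i(1 - n\mathbf{1}_{i=i_t})$.
        \item Mean (over the uniform choice of $i_t$):
            \begin{center}
                \begin{minipage}{0.8\textwidth}
                    \begin{align*}
                        \mathbb{E}\left[1 - n\mathbf{1}_{i=i_t}\right] = 1 - n\cdot\tfrac{1}{n} = 0
                        \quad\Rightarrow\quad \mathbb{E}[\xi_t\,|\,\theta_t] = 0.
                    \end{align*}
                \end{minipage}
            \end{center}
        \item Second moment:
            \begin{center}
                \begin{minipage}{0.8\textwidth}
                    \begin{align*}
                        \mathbb{E}\left[(1 - n\mathbf{1}_{i=i_t})^{2}\right] &= 1 - 2 + n = n-1,\\
                        \mathbb{E}\|\xi_t\|^{2} &= (n-1)\sum_{i=1}^{n} r_i^{2} = n(n-1)\cdot 2\mathcal{L}(\theta_t).
                    \end{align*}
                \end{minipage}
            \end{center}
        \item Dividing by $n(n-1)$ gives the stated noise scale $2\mathcal{L}(\theta_t)$.
    \end{itemize}
\end{frame}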
\begin{frame}{GD with specific Label Noise}
    \begin{center}
        \begin{minipage}{0.5\textwidth}
            \begin{align*}
                \theta_{t+1} = \theta_t - \frac{\eta}{n} \sum_{i=1}^{n} \left( h_{\theta_t}(x_i) - y_i^{t} \right) \nabla_{\theta} h_{\theta_t}(x_i)
            \end{align*}
        \end{minipage}
    \end{center}
    \begin{itemize}
        \item The noisy part at state $\theta$ belongs to the space spanned by $\{\nabla_\theta h_\theta(x_1),\ldots,\nabla_\theta h_\theta(x_n)\}$.
        \item During the loss stabilization induced by large step sizes, this may lead to a constant effective scale of the label noise.
    \end{itemize}
\end{frame}

\begin{frame}{Quadratic Loss Stabilization}
    \begin{itemize}[<+->]
        \item Non-convex toy model:
        \item one-dimensional data $x_i$ drawn from a distribution $\hat{\rho}$,
        \item labels generated by a quadratically parametrized linear model, $y_i = x_i \theta_*^{2}$,
        \item quadratic loss $F(\theta) := \frac{1}{4}\mathbb{E}_{\hat{\rho}}(y-x\theta^{2})^{2}$ with SGD recursion for $\eta > 0$, $t \in \mathbb{N}$:
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        \theta_{t+1} = \theta_t + \eta \theta_t x_{i_t}(y_{i_t} - x_{i_t}\theta^{2}_t).
                    \end{align*}
                \end{minipage}
            \end{center}
    \end{itemize}
\end{frame}

\begin{frame}{Quadratic Loss Stabilization}
    \begin{block}{\textbf{Proposition:} Loss Stabilization}
        Assume $\exists\, x_{\text{min}}, x_{\text{max}} > 0$ s.t.\ $\text{supp}(\hat{\rho}) \subset [x_\text{min}, x_\text{max}]$.
        Then for any $\eta \in \left( (\theta_* x_\text{min})^{-2},\, 1.25 (\theta_*x_\text{max})^{-2}\right)$ and any initial $\theta_{0} \in (0, \theta_*)$, for all $t \in \mathbb{N}$ we have \textbf{almost surely}
        \begin{center}
            \begin{minipage}{0.5\textwidth}
                \begin{align*}
                    F(\theta_t) \in \left(\varepsilon_0 \theta_*^{2},\; 0.17\, \theta_*^{2}\right),
                \end{align*}
            \end{minipage}
        \end{center}
        where $\varepsilon_0 := \min\left\{\frac{\eta(\theta_*x_\text{min})^{2}-1}{3},\; 0.02\right\}$.
        \newline
        Moreover, \textbf{almost surely} there exist $t, k > 0$ s.t.
        \begin{center}
            \begin{minipage}{0.5\textwidth}
                \begin{align*}
                    &\theta_{t+2k} \in (0.65\,\theta_*, (1-\varepsilon_0)\theta_*) \quad \text{and}\\
                    &\theta_{t+2k+1} \in ((1-\varepsilon_0)\theta_*, 1.162\,\theta_*).
                \end{align*}
            \end{minipage}
        \end{center}
    \end{block}
\end{frame}

\begin{frame}{Quadratic Loss Stabilization}
    \begin{itemize}[<+->]
        \item For clarity, rescale $\theta_t \to \theta_t/\theta_*$:
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        \theta_{t+1} = \theta_t + \gamma\theta_t\left( 1-\theta_t^{2} \right),
                    \end{align*}
                \end{minipage}
            \end{center}
            where $\gamma \sim \hat{\rho}_{\gamma}$, the pushforward of $\hat{\rho}$ under $z \mapsto \eta\theta_*^{2}z^{2}$ (derivation on the next slide).
        \item The condition on $\eta$ then becomes $\text{supp}(\hat{\rho}_{\gamma}) \subseteq (1, 1.25)$.
    \end{itemize}
\end{frame}
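% Added derivation sketch (not part of the original deck): it only combines the
% toy-model SGD recursion with the labels $y_i = x_i\theta_*^{2}$ and the
% rescaling $\theta_t \to \theta_t/\theta_*$ stated on the previous slide.
\begin{frame}{Quadratic Loss Stabilization: Rescaling Sketch}
    \begin{itemize}[<+->]
        \item Insert $y_{i_t} = x_{i_t}\theta_*^{2}$ into the toy-model recursion:
            \begin{center}
                \begin{minipage}{0.9\textwidth}
                    \begin{align*}
                        \theta_{t+1} = \theta_t + \eta \theta_t x_{i_t}\left(x_{i_t}\theta_*^{2} - x_{i_t}\theta_t^{2}\right)
                        = \theta_t + \eta \theta_*^{2} x_{i_t}^{2}\, \theta_t\left(1 - \tfrac{\theta_t^{2}}{\theta_*^{2}}\right).
                    \end{align*}
                \end{minipage}
            \end{center}
        \item Dividing by $\theta_*$ and writing $\gamma_t := \eta\theta_*^{2}x_{i_t}^{2}$ gives, for the rescaled iterate $\theta_t/\theta_*$,
            \begin{center}
                \begin{minipage}{0.6\textwidth}
                    \begin{align*}
                        \tfrac{\theta_{t+1}}{\theta_*} = \tfrac{\theta_t}{\theta_*} + \gamma_t \tfrac{\theta_t}{\theta_*}\left(1 - \tfrac{\theta_t^{2}}{\theta_*^{2}}\right),
                    \end{align*}
                \end{minipage}
            \end{center}
            so $\gamma_t$ is exactly the image of $x_{i_t}$ under $z \mapsto \eta\theta_*^{2}z^{2}$.
        \item Since $x_{i_t} \in [x_\text{min}, x_\text{max}]$, the assumption on $\eta$ gives $\gamma_t \in (1, 1.25)$, i.e.\ $\text{supp}(\hat{\rho}_{\gamma}) \subseteq (1, 1.25)$.
    \end{itemize}
\end{frame}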
\begin{frame}{Quadratic Loss Stabilization}
    \begin{itemize}[<+->]
        \item To prove the proposition, divide $(0, 1.162)$ into 4 regions:
        \item $I_0=(0, 0.65]$, $I_1=(0.65, 1-\varepsilon_0)$, $I_2=[1-\varepsilon_0, 1)$ and $I_3 = [1, 1.162)$.
        \item All iterates end up in $I_1$, leave it, and come back to $I_1$ within 2 steps.
        \item The argument is divided into 4 steps:
            \begin{enumerate}
                \item There is a $t\ge 0$ such that $\theta_t \in I_1\cup I_2\cup I_3$.
                \item If $\theta_t \in I_3$, then $\theta_{t+1} \in I_1 \cup I_2$.
                \item If $\theta_t \in I_2$, then there is a $k>0$ such that $\theta_{t+2k'} \in I_2$ for all $k' < k$ and $\theta_{t+2k} \in I_1$.
                \item If $\theta_t \in I_1$, then for all $k \ge 0$ it holds that $\theta_{t+2k} \in I_1$ and $\theta_{t+2k+1} \in (1+\varepsilon_0, 1.162)$.
            \end{enumerate}
    \end{itemize}
\end{frame}

\begin{frame}{Quadratic Loss Stabilization}
    \begin{itemize}[<+->]
        \item The proof is divided into 4 steps; show:
            \begin{enumerate}
                \item There is a $t\ge 0$ such that $\theta_t \in I_1\cup I_2\cup I_3$.
                \item If $\theta_t \in I_3$, then $\theta_{t+1} \in I_1 \cup I_2$.
                \item If $\theta_t \in I_2$, then there is a $k>0$ such that $\theta_{t+2k'} \in I_2$ for all $k' < k$ and $\theta_{t+2k} \in I_1$.
                \item If $\theta_t \in I_1$, then for all $k \ge 0$ it holds that $\theta_{t+2k} \in I_1$ and $\theta_{t+2k+1} \in (1+\varepsilon_0, 1.162)$.
            \end{enumerate}
    \end{itemize}
\end{frame}

\begin{frame}{Quadratic Loss Stabilization}
    \begin{itemize}[<+->]
        \item For \textbf{1} (there is a $t\ge 0$ with $\theta_t \in I_1 \cup I_2 \cup I_3$):
        \item show that the update map $h_\gamma(\theta) = \theta + \gamma\theta(1-\theta^{2})$, $\gamma \in (1, 1.25)$, keeps the iterates in $(0, 1.162)$.

            \vspace{0.5cm}

        \item For \textbf{2} (if $\theta_t \in I_3$, then $\theta_{t+1} \in I_1 \cup I_2$):
        \item for $\theta \in (1, 1.162)$, $h_\gamma(\theta)$ is affine in $\gamma$, decreasing as $\gamma$ increases (since $1-\theta^{2}<0$) and decreasing in $\theta$, hence
            \newline
            $0.652 = h_{1.25}(1.162) < h_\gamma(1.162) < h_\gamma(\theta) < h_\gamma(1) = 1$.
        \item So $\theta_{t+1} \in I_1 \cup I_2$.

            \vspace{0.5cm}

        \item Steps \textbf{3} \& \textbf{4} are a bit more involved, but can also be shown with basic analysis.
    \end{itemize}
\end{frame}


\begin{frame}{SGD Dynamics}
    \begin{itemize}[<+->]
        \item To further understand loss stabilization, assume perfect stabilization.
        \item Conjecture: \textit{during loss stabilization, SGD is well modeled by GD with constant label noise}.
        \item Label-noise dynamics can be studied with Stochastic Differential Equations (SDEs).
    \end{itemize}
\end{frame}

\begin{frame}{SGD Dynamics Heuristics}
    \begin{itemize}[<+->]
        \item Heuristic: rewrite the SGD iteration using the $p$-dimensional random vector $V_t(\theta_t, i_t) := \sqrt{\eta}\left(\nabla f(\theta_t) - \nabla f_{i_t}(\theta_t) \right)$ as
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        \theta_{t+1} = \theta_t - \eta \nabla f(\theta_t) + \sqrt{\eta}\, V_t(\theta_t, i_t).
                    \end{align*}
                \end{minipage}
            \end{center}
        \item A straightforward calculation (next slide) shows
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        &\mathbb{E}(V_t|\theta_t) = 0, \\
                        &\text{cov}(V_t, V_t|\theta_t) = \eta \Sigma(\theta_t), \\
                        &\Sigma(\theta_t) := \mathbb{E}\left(\frac{V_t V_t^{T}}{\eta}\Big|\theta_t \right).
                    \end{align*}
                \end{minipage}
            \end{center}
    \end{itemize}
\end{frame}
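% Added calculation sketch (not part of the original deck). It assumes the
% finite-sum structure $f = \frac{1}{n}\sum_i f_i$ with $i_t$ uniform, matching
% the SGD setting of the earlier slides.
\begin{frame}{SGD Dynamics Heuristics: Moments of $V_t$}
    \begin{itemize}[<+->]
        \item With $f = \frac{1}{n}\sum_{i=1}^{n} f_i$ and $i_t$ uniform on $\{1,\ldots,n\}$,
            \begin{center}
                \begin{minipage}{0.8\textwidth}
                    \begin{align*}
                        \mathbb{E}(V_t|\theta_t)
                        = \sqrt{\eta}\left(\nabla f(\theta_t) - \tfrac{1}{n}\sum_{i=1}^{n}\nabla f_{i}(\theta_t)\right) = 0.
                    \end{align*}
                \end{minipage}
            \end{center}
        \item For the covariance,
            \begin{center}
                \begin{minipage}{0.8\textwidth}
                    \begin{align*}
                        \text{cov}(V_t, V_t|\theta_t)
                        &= \eta\, \tfrac{1}{n}\sum_{i=1}^{n}
                        \left(\nabla f_{i}(\theta_t) - \nabla f(\theta_t)\right)
                        \left(\nabla f_{i}(\theta_t) - \nabla f(\theta_t)\right)^{T}\\
                        &= \eta\, \Sigma(\theta_t),
                    \end{align*}
                \end{minipage}
            \end{center}
            i.e.\ $\Sigma(\theta_t)$ is the covariance of the per-sample gradients $\nabla f_{i}(\theta_t)$.
    \end{itemize}
\end{frame}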
\begin{frame}{SGD Dynamics Heuristics}
    \begin{itemize}[<+->]
        \item Then consider a time-homogeneous It\^o-type SDE for $\tau\ge 0$,
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        d\theta_\tau = b(\theta_\tau)d\tau +\sqrt{\eta}\,\sigma(\theta_\tau)dB_\tau,
                    \end{align*}
                \end{minipage}
            \end{center}
            where $\theta_\tau \in \mathbb{R}^{p}$, $B_\tau$ is a standard $p$-dimensional Brownian motion, $b:\mathbb{R}^{p} \to \mathbb{R}^{p}$ the drift and $\sigma: \mathbb{R}^{p}\to \mathbb{R}^{p\times p}$ the diffusion matrix.

        \item Apply an Euler discretization with step size $\eta$ and approximate $\theta_{t\eta}$ by $\hat{\theta}_{t}$,
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        \hat{\theta}_{t+1}= \hat{\theta}_t + \eta\, b(\hat{\theta}_t) +\sqrt{\eta}\,\sigma(\hat{\theta}_t)Z_t,
                    \end{align*}
                \end{minipage}
            \end{center}
            with the Brownian increments $Z_t := B_{(t+1)\eta} - B_{t\eta}$.

        \item Set $b = -\nabla f$ and $\sigma(\theta) = \Sigma(\theta)^{\frac{1}{2}}$ (moment matching on the next slide); then
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        d\theta_\tau = -\nabla f(\theta_\tau)d\tau + (\eta\Sigma(\theta_\tau))^{\frac{1}{2}} dB_\tau.
                    \end{align*}
                \end{minipage}
            \end{center}
    \end{itemize}
\end{frame}
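% Added moment-matching sketch (not part of the original deck): it only compares
% the first two conditional moments of the Euler step above with those of the
% SGD step, using $Z_t = B_{(t+1)\eta} - B_{t\eta} \sim \mathcal{N}(0, \eta I_p)$.
\begin{frame}{SGD Dynamics Heuristics: Moment Matching}
    \begin{itemize}[<+->]
        \item Euler step of the SDE with $b=-\nabla f$ and $\sigma = \Sigma^{1/2}$:
            \begin{center}
                \begin{minipage}{0.8\textwidth}
                    \begin{align*}
                        \hat{\theta}_{t+1} - \hat{\theta}_t
                        = -\eta\nabla f(\hat{\theta}_t) + \sqrt{\eta}\,\Sigma(\hat{\theta}_t)^{\frac{1}{2}} Z_t,
                        \qquad Z_t \sim \mathcal{N}(0, \eta I_p).
                    \end{align*}
                \end{minipage}
            \end{center}
        \item SGD step: $\theta_{t+1} - \theta_t = -\eta\nabla f(\theta_t) + \sqrt{\eta}\, V_t$.
        \item Both increments have conditional mean $-\eta\nabla f(\theta)$ and conditional covariance
            \begin{center}
                \begin{minipage}{0.8\textwidth}
                    \begin{align*}
                        \text{cov}\left(\sqrt{\eta}\,\Sigma(\theta)^{\frac{1}{2}} Z_t\right)
                        = \eta^{2}\Sigma(\theta)
                        = \text{cov}\left(\sqrt{\eta}\, V_t\right),
                    \end{align*}
                \end{minipage}
            \end{center}
            so the SDE matches the first two conditional moments of the SGD update (replacing the empirical noise by a Gaussian is the remaining approximation).
    \end{itemize}
\end{frame}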
\begin{frame}{SGD Dynamics Heuristics}
    \begin{itemize}[<+->]
        \item In our case we have the loss function $\mathcal{L}$.
        \item The noise at state $\theta$ lies in the span of $\{\nabla_\theta h_\theta(x_1), \ldots, \nabla_\theta h_\theta(x_n)\}$.
        \item If loss stabilization occurs at a level set $\delta > 0$, the model becomes
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        d\theta_\tau = -\nabla_\theta \mathcal{L}(\theta_\tau)d\tau + \sqrt{\eta\delta}\, \phi_{\theta_\tau}\left(X\right)^{T}dB_\tau,
                    \end{align*}
                \end{minipage}
            \end{center}
            \vspace{0.5cm}
            where $\phi_\theta(X) := [\nabla_\theta h_\theta(x_i)^{T}]_{i=1}^{n} \in \mathbb{R}^{n\times p}$ is referred to as the Jacobian.
    \end{itemize}
\end{frame}

\begin{frame}{Diagonal Linear Networks}
    \begin{itemize}[<+->]
        \item Two-layer linear network with only diagonal connections:
        \item prediction function $h_{u, v}(x) = \langle u, v\odot x\rangle = \langle u \odot v, x\rangle$,
        \item linear predictor $\beta := u \odot v \in \mathbb{R}^{d}$,
        \item gradient $\nabla_u h_{u, v}(x) = v \odot x$.
        \item SDE model (symmetric in $v$):
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        du_\tau = -\nabla_u \mathcal{L}(u_\tau, v_\tau) d\tau +\sqrt{\eta \delta}\, v_\tau \odot [X^{T}dB_\tau].
                    \end{align*}
                \end{minipage}
            \end{center}
    \end{itemize}
\end{frame}

\begin{frame}{Diagonal Linear Networks: Conclusion}
    \begin{itemize}[<+->]
        \item During the first phase, SGD with large $\eta$
            \begin{enumerate}
                \item decreases the training loss,
                \item stabilizes at some level set $\delta$.
            \end{enumerate}
        \item Meanwhile there is an effective noise-driven dynamics which oscillates at scale $\mathcal{O}(\sqrt{\eta\delta})$.
        \item Decreasing the step size later leads to recovery of the sparsest predictor.
    \end{itemize}
\end{frame}

\begin{frame}{Sparsity Coefficient}
    \begin{itemize}[<+->]
        \item The SDE dynamics of diagonal networks exhibits a sparsity bias: the noise term $v \odot [X^{T}dB_\tau]$ leads to a coordinate shrinkage effect.
        \item Generally the same happens w.r.t.\ the Jacobian $\phi_\theta(X)$, with the same effect:
            \begin{center}
                \begin{minipage}{0.5\textwidth}
                    \begin{align*}
                        d\theta_\tau = -\nabla_\theta \mathcal{L}(\theta_\tau)d\tau + \sqrt{\eta\delta}\, \phi_{\theta_\tau}\left(X\right)^{T}dB_\tau.
                    \end{align*}
                \end{minipage}
            \end{center}
        \item Conjecture: the noise part of the SDE minimizes the $\ell_{2}$-norm of the columns of $\phi_\theta(X)$ (DLN illustration on the next slide).
        \item Columns can be reduced to zero below a threshold (the fitting part does not allow the Jacobian to collapse to zero).
    \end{itemize}
\end{frame}
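% Added illustration (not part of the original deck): it only uses the DLN
% gradient $\nabla_u h_{u,v}(x) = v \odot x$ from the previous slides; the block
% $\phi_u(X)$ of the Jacobian w.r.t.\ $u$ and the column notation $X_{:,j}$ are
% introduced here for the sketch.
\begin{frame}{Sparsity Coefficient: DLN Illustration}
    \begin{itemize}[<+->]
        \item For a diagonal linear network, the rows of the Jacobian block w.r.t.\ $u$ are $\nabla_u h_{u,v}(x_i) = v \odot x_i$, i.e.\ $\phi_{u}(X) = X\,\text{diag}(v)$.
        \item Its $j$-th column is $v_j X_{:,j}$, with
            \begin{center}
                \begin{minipage}{0.7\textwidth}
                    \begin{align*}
                        \left\|\,[\phi_{u}(X)]_{:,j}\,\right\|_2 = |v_j|\,\|X_{:,j}\|_2
                        \qquad \left(\text{and } |u_j|\,\|X_{:,j}\|_2 \text{ for the block w.r.t.\ } v\right).
                    \end{align*}
                \end{minipage}
            \end{center}
        \item Shrinking the column norms therefore shrinks the coordinates $|u_j|$ and $|v_j|$, driving unneeded coordinates of the predictor $\beta = u \odot v$ to zero: a sparse predictor.
    \end{itemize}
\end{frame}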
\begin{frame}{DLN Feature Sparsity Coefficient}
    \begin{itemize}[<+->]
        \item The Jacobian can be simplified during loss stabilization.
        \item Motivation: count the average number of distinct, nonzero activations over the training set = \textbf{feature sparsity coefficient}.
    \end{itemize}
\end{frame}

\begin{frame}{DLN Results}
    \begin{figure}[htpb]
        \centering
        \includegraphics[width=0.8\textwidth, height=0.4\textheight]{./pics/dn_loss.png}
        \includegraphics[width=0.8\textwidth, height=0.4\textheight]{./pics/dn_sparsity.png}
        \includegraphics[width=\textwidth]{./pics/dn_setup.png}
        \caption{Diagonal Linear Nets Results}
    \end{figure}
\end{frame}

\begin{frame}{ReLU networks}
    \begin{itemize}
        \item \textbf{Two Layer} ReLU setup: 1D regression task with 12 points
        \item SGD with $100$ neurons
        \item Similar results are observed
    \end{itemize}

    \begin{itemize}
        \item \textbf{Three Layer} ReLU setup: teacher-student setup
        \item Two neurons on each layer
        \item Student network with $10$ neurons trained on $50$ samples
        \item Warm-up step size (increasing the step size according to a schedule)
    \end{itemize}
\end{frame}

\begin{frame}{ReLU Results}
    \begin{figure}[htpb]
        \centering
        \includegraphics[width=0.8\textwidth, height=0.4\textheight]{./pics/relu2_loss.png}
        \includegraphics[width=0.8\textwidth, height=0.4\textheight]{./pics/relu2_sparsity.png}
        \includegraphics[width=\textwidth]{./pics/relu2_setup.png}
        \caption{\textbf{Two Layer} ReLU: 1D regression task}
    \end{figure}
\end{frame}

\begin{frame}{ReLU Results}
    \begin{figure}[htpb]
        \centering
        \includegraphics[width=0.8\textwidth, height=0.4\textheight]{./pics/relu3_loss.png}
        \includegraphics[width=\textwidth, height=0.4\textheight]{./pics/relu3_sparsity.png}
        \includegraphics[width=\textwidth]{./pics/relu3_setup.png}
        \caption{\textbf{Three Layer} ReLU: teacher-student setup}
    \end{figure}
\end{frame}

\begin{frame}{Deeper Networks: Results}
    \begin{figure}[htpb]
        \centering
        \includegraphics[width=0.8\textwidth, height=0.4\textheight]{./pics/densen_tiny_loss.png}
        \includegraphics[width=0.8\textwidth, height=0.4\textheight]{./pics/densen_tiny_sparsity.png}
        \includegraphics[width=\textwidth]{./pics/densen_tiny_setup.png}
        \caption{\textbf{DenseNet-100} trained on \textbf{ImageNet} (image classification task)}
    \end{figure}
\end{frame}

\begin{frame}{Bibliography}
    \nocite{andriushchenko2023sgd}
    \nocite{shalev2014understanding}
    \nocite{li2018stochastic}
    \nocite{pillaudvivien2022label}
    \printbibliography
\end{frame}
\end{document}