pres.tex

      1 \documentclass[fleqn]{beamer}
      2 \beamertemplatenavigationsymbolsempty
      3 
      4 \usepackage[T1]{fontenc}
      5 \usepackage[utf8]{inputenc}
      6 
      7 \usepackage{amsmath,amssymb}
      8 \usepackage{graphicx}
      9 \usepackage{mathptmx}
     10 \usepackage{subcaption}
     11 \usepackage{amsthm}
     12 \usepackage{tikz}
     13 %\usepackage[colorlinks=true,naturalnames=true,plainpages=false,pdfpagelabels=true]{hyperref}
     14 \usetikzlibrary{patterns,decorations.pathmorphing,positioning, arrows, chains}
     15 
     16 \usepackage[backend=biber, sorting=none]{biblatex}
     17 \addbibresource{cite.bib}
     18 
     19 \setbeamertemplate{endpage}{%
     20     \begin{frame}
     21         \centering
     22         \Large \emph{Thank You!}
     23     \end{frame}
     24 }
     25 
     26 \AtEndDocument{\usebeamertemplate{endpage}}
     27 
     28 % vertical separator macro
     29 \newcommand{\vsep}{
     30   \column{0.0\textwidth}
     31     \begin{tikzpicture}
     32       \draw[very thick,black!10] (0,0) -- (0,7.3);
     33     \end{tikzpicture}
     34 }
     35 \setlength{\mathindent}{0pt}
     36 
     37 % Beamer theme
     38 \usetheme{UniVienna}
     39 \usefonttheme[onlysmall]{structurebold}
     40 \mode<presentation>
     41 \setbeamercovered{transparent=10}
     42 
     43 \title
     44 {SGD with Large Step Sizes Learns Sparse Features}
     45 \subtitle{Seminar Optimization}
     46 \author[Popović Milutin]
     47 {Popović Milutin\newline\newline Supervisor: Radu Ioan Bot}
     48 \date{23. January 2024}
     49 
     50 \begin{document}
     51     \begin{frame}
     52         \maketitle
     53     \end{frame}
     54 
     55     \begin{frame}{Introduction SGD}
     56         \begin{itemize}[<+->]
     57         \item[] Given:
     58             \begin{itemize}
     59                 \item input/output samples $(x_i, y_i)_{i=1}^{n} \in
     60                     \mathbb{R}^{d} \times \mathbb{R}$
     61                 \item Prediction functions
     62                     $\mathcal{H} = \{x \mapsto h_{\theta}(x)\; | \; \theta \in
     63                     \mathbb{R}^{p}\}$
     64                 \item setting $p \gg n$
     65             \end{itemize}
     66         \item[] Minimize the loss function
     67         \begin{center}
     68             \begin{minipage}{0.5\textwidth}
     69                 \begin{align*}
     70                     \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left(
     71                     h_\theta(x_i) - y_i \right)^{2},
     72                 \end{align*}
     73             \end{minipage}
     74         \end{center}
     75         \begin{itemize}
     76             \item using Stochastic Gradient Descent (SGD).
     77         \end{itemize}
     78 
     79     \end{itemize}
     80     \end{frame}
     81 
     82     \begin{frame}{Introduction SGD}
     83 
     84         \begin{itemize}[<+->]
     85             \item Instead of computing the full gradient of the sum,
     86             \item draw an index $i_t \sim U(\{1,\ldots,n\})$ uniformly at random at each step,
     87             \item the SGD recursion for learning rate $\eta > 0$ and initial $\theta_0
     88                 \in \mathbb{R}^{p}$ reads, for all $t \in \mathbb{N}$ (derivation sketched below):
     89         \begin{center}
     90             \begin{minipage}{0.5\textwidth}
     91                 \begin{align*}
     92                     \theta_{t+1} = \theta_t - \eta\left(h_{\theta_t}(x_{i_t})
     93                     - y_{i_t}\right)
     94                     \nabla_{\theta} h_{\theta_t}(x_{i_t}).
     95                 \end{align*}
     96             \end{minipage}
     97         \end{center}
     98         \end{itemize}
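        \vspace{0.3cm}
        {\small Sketch: this is one gradient step on the sampled loss
        $\tfrac{1}{2}(h_{\theta}(x_{i_t})-y_{i_t})^{2}$, since
        \begin{align*}
            \nabla_\theta \tfrac{1}{2}\left( h_\theta(x_{i_t}) - y_{i_t} \right)^{2}
            = \left( h_\theta(x_{i_t}) - y_{i_t} \right)
            \nabla_\theta h_\theta(x_{i_t}).
        \end{align*}}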
     99 
    100     \end{frame}
    101 
    102 
    103     \begin{frame}{GD with specific Label Noise}
    104         \begin{itemize}[<+->]
    105             \item Define a random vector $\xi_t \in \mathbb{R}^{n}$ in each iteration
    106                 $t \ge 0$
    107                 \begin{center}
    108                 \begin{minipage}{0.5\textwidth}
    109                     \begin{align*}
    110                         [\xi_t]_i := (h_{\theta_t}(x_i) -
    111                         y_i)(1-n\mathbf{1}_{i=i_t})
    112                     \end{align*}
    113                 \end{minipage}
    114                 \end{center}
    115                 \vspace{0.5cm}
    116             \item Then $(\theta_t)_{t\ge0}$ follows the full-batch gradient
    117                 dynamics on $\mathcal{L}$ with noisy labels
    118                 \begin{center}
    119                 \begin{minipage}{0.5\textwidth}
    120                 \begin{align*}
    121                     \theta_{t+1} = \theta_t - \frac{\eta}{n} \sum_{i=1}^{n}
    122                     \left( h_{\theta_t}(x_i) - y_i^{t} \right)
    123                     \nabla_{\theta} h_{\theta_t}(x_i),
    124                 \end{align*}
    125                 \end{minipage}
    126                 \end{center}
    127                 where $y^{t}:= y + \xi_t$.
    128             \item Note: $\xi_t$ is a mean-zero random vector (check sketched below).
    129             \item Note: its variance satisfies
    130                 $\frac{1}{n(n-1)}\mathbb{E}\|\xi_t\|^{2} = 2
    131                 \mathcal{L}(\theta_t)$.
    132         \end{itemize}
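        \vspace{0.3cm}
        {\small Sketch of the check (using only $\mathbb{P}(i_t = i) = \tfrac{1}{n}$):
        \begin{align*}
            \mathbb{E}\left[1 - n\mathbf{1}_{i=i_t}\right] &= 1 - n\cdot\tfrac{1}{n} = 0, \\
            \mathbb{E}\left[(1 - n\mathbf{1}_{i=i_t})^{2}\right] &= 1 - 2 + n = n-1,
        \end{align*}
        so $\mathbb{E}\|\xi_t\|^{2} = (n-1)\sum_{i=1}^{n}(h_{\theta_t}(x_i)-y_i)^{2}
        = 2n(n-1)\mathcal{L}(\theta_t)$.}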
    133     \end{frame}
    134 
    135     \begin{frame}{GD with specific Label Noise}
    136         \begin{center}
    137         \begin{minipage}{0.5\textwidth}
    138             \begin{align*}
    139                 \theta_{t+1} = \theta_t - \frac{\eta}{n} \sum_{i=1}^{n}
    140                 \left( h_{\theta_t}(x_i) - y_i^{t} \right)
    141                 \nabla_{\theta} h_{\theta_t}(x_i),
    142             \end{align*}
    143         \end{minipage}
    144         \end{center}
    145         \begin{itemize}
    146             \item The noisy part at state $\theta$ belongs to the space
    147                 spanned by $\{\nabla_\theta
    148                 h_\theta(x_1),\ldots,\nabla_\theta h_\theta(x_n)\}$ (see the identity below).
    149             \item The loss stabilization caused by large step sizes may thus
    150                 lead to an approximately constant effective scale of label noise.
    151         \end{itemize}
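        \vspace{0.3cm}
        {\small Sketch of the identity: with the label noise $\xi_t$ from the previous
        slide, subtracting the clean full-batch step shows that the noise injected at
        iteration $t$ is
        \begin{align*}
            \frac{\eta}{n} \sum_{i=1}^{n} [\xi_t]_i \,
            \nabla_{\theta} h_{\theta_t}(x_i),
        \end{align*}
        a linear combination of the gradients $\nabla_\theta h_{\theta_t}(x_i)$.}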
    152     \end{frame}
    153 
    154     \begin{frame}{Quadratic Loss Stabilization}
    155         \begin{itemize}[<+->]
    156             \item Non-convex toy model:
    157             \item one-dimensional data $x_i$ from a distribution $\hat{\rho}$
    158             \item labels $y_i = x_i \theta_*^{2}$, $\theta_* > 0$ (model $h_\theta(x) = \theta^{2}x$)
    159             \item quadratic loss $F(\theta) :=
    160                 \frac{1}{4}\mathbb{E}_{\hat{\rho}}(y-x\theta^{2})^{2}$ with
    161                 SGD recursion (derivation sketched below) for $\eta > 0$, $t \in \mathbb{N}$:
    162         \begin{center}
    163         \begin{minipage}{0.5\textwidth}
    164             \begin{align*}
    165                 \theta_{t+1} = \theta_t + \eta \theta_t x_{i_t}(y_{i_t} -
    166                 x_{i_t}\theta^{2}_t)
    167             \end{align*}
    168         \end{minipage}
    169         \end{center}
    170 
    171 
    172         \end{itemize}
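        {\small Sketch of the derivation: for a single sample $(x,y)$,
        \begin{align*}
            \nabla_\theta \tfrac{1}{4}\left( y - x\theta^{2} \right)^{2}
            = -\theta x \left( y - x\theta^{2} \right),
        \end{align*}
        so one stochastic gradient step with sample $i_t$ gives the recursion above.}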
    173 
    174     \end{frame}
    175 
    176     \begin{frame}{Quadratic Loss Stabilization}
    177         \begin{block}{\textbf{Proposition:} Loss Stabilization}
    178             Assume $\exists x_{\text{min}}, x_{\text{max}} > 0$ s.t.
    179             $\text{supp}(\hat{\rho}) \subset [x_\text{min}, x_\text{max}]$.
    180             Then for any $\eta \in \left( (\theta_* x_\text{min})^{-2}, 1.25
    181             (\theta_*x_\text{max})^{-2}\right)$, any initial $\theta_{0} \in
    182             (0, \theta_*)$, for all $t \in \mathbb{N}$ we have that
    183             \textbf{almost surely}
    184             \begin{center}
    185             \begin{minipage}{0.5\textwidth}
    186             \begin{align*}
    187                 F(\theta_t) \in \left(\varepsilon_0 \theta_*^{2},\; 0.17
    188                 \theta_*^{2}\right),
    189             \end{align*}
    190             \end{minipage}
    191             \end{center}
    192         where $\varepsilon_0 := \min
    193         \{\frac{\eta(\theta_*x_\text{min})^{2}-1}{3},\; 0.02\}$.
    194         \newline
    195         And \textbf{almost surely} there exists $t, k >0$ s.t.
    196         \begin{center}
    197         \begin{minipage}{0.5\textwidth}
    198         \begin{align*}
    199             &\theta_{t+2k} \in (0.65\theta_*, (1-\varepsilon_0)\theta_*) \quad
    200             \text{and}\\
    201             &\theta_{t+2k+1} \in ((1-\varepsilon_0)\theta_*, 1.162\theta_*)
    202         \end{align*}
    203         \end{minipage}
    204         \end{center}
    205         \end{block}
    206     \end{frame}
    207 
    208     \begin{frame}{Quadratic Loss Stabilization}
    209         \begin{itemize}[<+->]
    210             \item For clarity rescale $\theta_t \to \theta_t/\theta_*$
    211             \begin{center}
    212             \begin{minipage}{0.5\textwidth}
    213                 \begin{align*}
    214                     \theta_{t+1} = \theta_t + \gamma\theta_t\left(
    215                     1-\theta_t^{2} \right),
    216                 \end{align*}
    217             \end{minipage}
    218             \end{center}
    219             where $\gamma \sim \hat{\rho}_{\gamma}$ and $\hat{\rho}_{\gamma}$ is the pushforward of
    220             $\hat{\rho}$ under $z \mapsto \eta\theta_*^{2}z^{2}$ (substitution sketched below).
    221 
    222         \item The condition on $\eta$ then translates into
    223             $\text{supp}(\hat{\rho}_{\gamma}) \subseteq (1, 1.25)$.
    224         \end{itemize}
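        \vspace{0.3cm}
        {\small Sketch of the substitution: with $y_{i_t} = x_{i_t}\theta_*^{2}$,
        \begin{align*}
            \frac{\theta_{t+1}}{\theta_*}
            = \frac{\theta_t}{\theta_*}
            + \eta\theta_*^{2}x_{i_t}^{2}\,
            \frac{\theta_t}{\theta_*}\left(1 - \frac{\theta_t^{2}}{\theta_*^{2}}\right),
        \end{align*}
        so the rescaled iterate follows the recursion above with $\gamma = \eta\theta_*^{2}x_{i_t}^{2}$.}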
    225     \end{frame}
    226 
    227     \begin{frame}{Quadratic Loss Stabilization}
    228         \begin{itemize}[<+->]
    229             \item To prove the proposition, divide $(0, 1.162)$ into 4
    230                 regions:
    231             \item $I_0=(0, 0.65]$, $I_1=(0.65, 1-\varepsilon)$,
    232                 $I_2=[1-\varepsilon, 1)$ and $I_3 = [1, 1.162)$.
    233 
    234             \item All iterates eventually land in $I_1$; afterwards they leave
    235                 $I_1$ and return to it every 2 steps.
    236 
    237             \item This is shown in 4 steps:
    238                 \begin{enumerate}
    239                     \item There is a $t\ge 0$ such that $\theta_t \in I_1\cup
    240                         I_2\cup I_3$.
    241                     \item If $\theta_t \in I_3$, then $\theta_{t+1} \in I_1
    242                         \cup I_2$.
    243                     \item If $\theta_t \in I_2$, then there is a $k>0$ such that
    244                         $\forall k' < k$ it holds that $\theta_{t+2k'} \in
    245                         I_2$, and $\theta_{t+2k} \in I_1$.
    246                     \item If $\theta_t \in I_1$, then $\forall k \ge 0$ it
    247                         holds that
    248                         $\theta_{t+2k} \in I_1$ and $\theta_{t+2k+1} \in
    249                         (1+\varepsilon, 1.162)$.
    250                 \end{enumerate}
    251 
    252 
    253         \end{itemize}
    254     \end{frame}
    255 
    256     \begin{frame}{Quadratic Loss Stabilization}
    257         \begin{itemize}[<+->]
    258             \item The proof is divided into 4 steps; we show:
    259                 \begin{enumerate}
    260                     \item There is a $t\ge 0$ such that $\theta_t \in I_1\cup
    261                         I_2\cup I_3$.
    262                     \item If $\theta_t \in I_3$, then $\theta_{t+1} \in I_1
    263                         \cup I_2$.
    264                     \item If $\theta_t \in I_2$, then there is a $k>0$ such that
    265                         $\forall k' < k$ it holds that $\theta_{t+2k'} \in
    266                         I_2$, and $\theta_{t+2k} \in I_1$.
    267                     \item If $\theta_t \in I_1$, then $\forall k \ge 0$ it
    268                         holds that
    269                         $\theta_{t+2k} \in I_1$ and $\theta_{t+2k+1} \in
    270                         (1+\varepsilon, 1.162)$.
    271                 \end{enumerate}
    272         \end{itemize}
    273     \end{frame}
    274 
    275     \begin{frame}{Quadratic Loss Stabilization}
    276         \begin{itemize}[<+->]
    277             \item For \textbf{1}: (There is a $t\ge 0: \theta_t \in I_1 \cup
    278                 I_2 \cup I_3)$
    279             \item Show that the update map $h_\gamma(\theta) = \theta +
    280                 \gamma\theta(1-\theta^{2})$ maps $(0, 1.162)$ into itself for every
    281                 $\gamma \in (1, 1.25)$; on $I_0$ it satisfies
                        $h_\gamma(\theta) \ge \theta\,(1 + \gamma(1-0.65^{2})) > 1.57\,\theta$,
                        so iterates grow geometrically until they leave $I_0$.
    282 
    283         \vspace{0.5cm}
    284 
    285             \item For \textbf{2}: (If $\theta_t \in I_3$, then $\theta_{t+1} \in I_1
    286                         \cup I_2$.)
    287             \item For $\theta \in (1, 1.162)$ note that $h_\gamma(\theta)$ is linear
    288                 in $\gamma$ and, since $1-\theta^{2}<0$, decreasing as $\gamma$
                        increases; it is also decreasing in $\theta$ on this interval, hence \newline
    289                 $0.652 = h_{1.25}(1.162) < h_\gamma(1.162) < h_\gamma(\theta)
    290                     < h_\gamma(1) = 1$
    291 
    292             \item So $\theta_{t+1} \in I_1 \cup I_2$.
    293 
    294 
    295         \vspace{0.5cm}
    296 
    297             \item Steps \textbf{3} \& \textbf{4} are a bit more involved but can
    298                 also be shown with elementary analysis.
    299         \end{itemize}
    300     \end{frame}
    301 
    302 
    303     \begin{frame}{SGD Dynamics}
    304         \begin{itemize}[<+->]
    305             \item To further understand loss stabilization, assume perfect
    306                 stabilization
    307             \item Conjecture: \textit{During loss stabilization, SGD is well
    308                 modeled by GD with constant label noise}
    309 
    310             \item Label-noise dynamics can be studied with Stochastic
    311                 Differential Equations (SDEs)
    312         \end{itemize}
    313 
    314     \end{frame}
    315 
    316     \begin{frame}{SGD Dynamics Heuristics}
    317         \begin{itemize}[<+->]
    318             \item Heuristic: rewrite the SGD iteration (with $f := \mathcal{L}$ and
                        $f_i$ the loss on sample $i$), where $V_t(\theta_t, i_t) :=
    319                 \sqrt{\eta}\left(\nabla_\theta f(\theta_t) - \nabla_\theta
    320                 f_{i_t}(\theta_t)\right)$ is a $p$-dimensional r.v. (check below)
    321             \begin{center}
    322             \begin{minipage}{0.5\textwidth}
    323                 \begin{align*}
    324                     \theta_{t+1} = \theta_t - \eta \nabla f(\theta_t) + \sqrt{\eta}\,
    325                     V_t(\theta_t, i_t).
    326                 \end{align*}
    327             \end{minipage}
    328             \end{center}
    329 
    330         \item A straightforward calculation shows
    331             \begin{center}
    332             \begin{minipage}{0.5\textwidth}
    333                 \begin{align*}
    334                     &\mathbb{E}(V_t|\theta_t) = 0, \\
    335                     &\text{cov}(V_t, V_t|\theta_t) = \eta \Sigma(\theta_t), \\
    336                     &\Sigma(\theta_t) :=
    337                     \mathbb{E}\left(\frac{V_t V_t^{T}}{\eta}\Big|\theta_t \right)
    338                 \end{align*}
    339             \end{minipage}
    340             \end{center}
    341         \end{itemize}
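        \vspace{0.3cm}
        {\small Check: by definition of $V_t$,
        \begin{align*}
            \theta_t - \eta \nabla f(\theta_t) + \sqrt{\eta}\, V_t(\theta_t, i_t)
            = \theta_t - \eta \nabla f_{i_t}(\theta_t),
        \end{align*}
        which is exactly the SGD step on the sampled loss $f_{i_t}$.}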
    342     \end{frame}
    343 
    344     \begin{frame}{SGD Dynamics Heuristics}
    345         \begin{itemize}[<+->]
    346             \item Then consider a time-homogeneous It\^o-type SDE for $\tau\ge 0$
    347             \begin{center}
    348             \begin{minipage}{0.5\textwidth}
    349                 \begin{align*}
    350                     d\theta_\tau = b(\theta_\tau)d\tau
    351                     +\sqrt{\eta}\sigma(\theta_\tau)dB_\tau,
    352                 \end{align*}
    353             \end{minipage}
    354             \end{center}
    355             where $\theta_\tau \in \mathbb{R}^{p}$, $B_\tau$ standard p-dim.
    356             Brownian
    357             motion, $b:\mathbb{R}^{p} \to \mathbb{R}^{p}$ the drift and
    358             $\sigma: \mathbb{R}^{p}\to \mathbb{R}^{p\times p}$ the diffusion
    359             matrix
    360 
    361         \item Apply an Euler discretization with step size $\eta$ and
    362             approximate $\theta_{t\eta}$ by $\hat{\theta}_{t}$ (consistency check below)
    363             \begin{center}
    364             \begin{minipage}{0.5\textwidth}
    365                 \begin{align*}
    366                     \hat{\theta}_{t+1}= \hat{\theta}_t + \eta b(\hat{\theta}_t)
    367                     +\sqrt{\eta}\, \sigma(\hat{\theta}_t)Z_t,
    368                 \end{align*}
    369             \end{minipage}
    370             \end{center}
    371             with $Z_t = B_{(t+1)\eta} - B_{t\eta} \sim \mathcal{N}(0, \eta I_p)$.
    372 
    373         \item Set $b = -\nabla_\theta f$ and $\sigma(\theta) =
    374             \Sigma(\theta)^{\frac{1}{2}}$, then
    375             \begin{center}
    376             \begin{minipage}{0.5\textwidth}
    377                 \begin{align*}
    378                     d\theta_\tau = -\nabla f(\theta_\tau)d\tau
    379                     + (\eta\Sigma(\theta_\tau))^{\frac{1}{2}} dB_\tau.
    380                 \end{align*}
    381             \end{minipage}
    382             \end{center}
    383         \end{itemize}
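        \vspace{0.3cm}
        {\small Consistency check (sketch): one Euler step of this SDE reads
        $\hat{\theta}_{t+1} = \hat{\theta}_t - \eta\nabla f(\hat{\theta}_t)
        + \sqrt{\eta}\,\Sigma(\hat{\theta}_t)^{\frac{1}{2}} Z_t$, whose noise has
        conditional covariance
        \begin{align*}
            \eta\, \Sigma(\hat{\theta}_t)^{\frac{1}{2}}\,
            \mathrm{cov}(Z_t)\, \Sigma(\hat{\theta}_t)^{\frac{1}{2}}
            = \eta^{2}\Sigma(\hat{\theta}_t),
        \end{align*}
        matching the covariance $\eta\cdot\eta\Sigma(\theta_t) = \eta^{2}\Sigma(\theta_t)$
        of the SGD noise $\sqrt{\eta}\,V_t$.}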
    384     \end{frame}
    385 
    386     \begin{frame}{SGD Dynamics Heuristics}
    387         \begin{itemize}[<+->]
    388             \item In our case: we have a loss function $\mathcal{L}$
    389             \item The noise at state $\theta$ lies in the span of
    390                 $\{\nabla_\theta h_\theta(x_1), \ldots, \nabla_\theta
    391                 h_\theta(x_n)\}$ (see below)
    392             \item Loss stabilization occurs at a level $\delta > 0$, leading to
    393             \begin{center}
    394             \begin{minipage}{0.5\textwidth}
    395                 \begin{align*}
    396                     d\theta_\tau = -\nabla_\theta \mathcal{L}(\theta_\tau)d\tau
    397                     + \sqrt{\eta\delta}
    398                     \phi_{\theta_\tau}\left(X\right)^{T}dB_\tau,
    399                 \end{align*}
    400             \end{minipage}
    401             \end{center}
    402             \vspace{0.5cm}
    403             where $\phi_\theta(X) := [\nabla_\theta
    404             h_\theta(x_i)^{T}]_{i=1}^{n} \in \mathbb{R}^{n\times p}$ is
    405             referred to as the Jacobian.
    406         \end{itemize}
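        \vspace{0.3cm}
        {\small Sketch: the noise increment is a linear combination of the sample
        gradients, consistent with the label-noise picture above:
        \begin{align*}
            \sqrt{\eta\delta}\,\phi_{\theta_\tau}\left(X\right)^{T}dB_\tau
            = \sqrt{\eta\delta}\sum_{i=1}^{n}
            \nabla_\theta h_{\theta_\tau}(x_i)\,[dB_\tau]_i.
        \end{align*}}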
    407     \end{frame}
    408 
    409     \begin{frame}{Diagonal Linear Networks}
    410         \begin{itemize}[<+->]
    411             \item Two layer linear network with only diagonal connections:
    412             \item Prediction function:
    413                 $h_{u, v}(x) = \langle u, v\odot x\rangle =  \langle u \odot
    414                 v, x\rangle$.
    415             \item Linear predictor $\beta := u \odot v \in \mathbb{R}^{d}$
    416             \item Gradient $\nabla_u h_{u, v}(x) = v \odot x$.
    417             \item SDE model (the equation for $v$ is symmetric; see the check below)
    418             \begin{center}
    419             \begin{minipage}{0.5\textwidth}
    420                 \begin{align*}
    421                     du_\tau = -\nabla_u \mathcal{L}(u_\tau, v_\tau) d\tau
    422                     +\sqrt{\eta \delta} v_\tau \odot [X^{T}dB_\tau].
    423                 \end{align*}
    424             \end{minipage}
    425             \end{center}
    426 
    427         \end{itemize}
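        \vspace{0.3cm}
        {\small Check (sketch): since $\nabla_u h_{u,v}(x_i) = v \odot x_i$, the
        Jacobian restricted to $u$ is $\phi_{u,v}(X) = X\,\mathrm{diag}(v)$ (row $i$
        equal to $(v \odot x_i)^{T}$), so the $u$-block of the general noise term
        specializes to
        \begin{align*}
            \sqrt{\eta\delta}\,\phi_{u,v}(X)^{T}dB_\tau
            = \sqrt{\eta\delta}\,\mathrm{diag}(v)\,X^{T}dB_\tau
            = \sqrt{\eta\delta}\, v \odot [X^{T}dB_\tau].
        \end{align*}}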
    428     \end{frame}
    429 
    430     \begin{frame}{Diagonal Linear Networks: Conclusion}
    431         \begin{itemize}[<+->]
    432             \item During the first phase SGD with large $\eta$
    433                 \begin{enumerate}
    434                     \item decreases the training loss
    435                     \item stabilizes it at some level $\delta$
    436                 \end{enumerate}
    437             \item Meanwhile an effective noise-driven dynamics makes the iterates
    438                 oscillate at scale $\mathcal{O}(\sqrt{\eta\delta})$
    439             \item Decreasing the step size later leads to recovery of the
    440                 sparsest predictor.
    441         \end{itemize}
    442     \end{frame}
    443 
    444     \begin{frame}{Sparsity Coefficient}
    445         \begin{itemize}[<+->]
    446              \item The SDE dynamics of diagonal networks highlights a sparsity
    447                  bias: the noise term $v \odot [X^{T}dB_\tau]$ induces a
    448                  coordinate-wise shrinkage effect
    449             \item In general the same mechanism acts through the Jacobian
    450                 $\phi_\theta(X)$, with the same shrinkage effect.
    451             \begin{center}
    452             \begin{minipage}{0.5\textwidth}
    453                 \begin{align*}
    454                     d\theta_\tau = -\nabla_\theta \mathcal{L}(\theta_\tau)d\tau
    455                     + \sqrt{\eta\delta}
    456                     \phi_{\theta_\tau}\left(X\right)^{T}dB_\tau,
    457                 \end{align*}
    458             \end{minipage}
    459             \end{center}
    460             \item Conjecture: the noise part of the SDE minimizes the
    461                 $\ell_{2}$-norms of the columns of $\phi_\theta(X)$.
    462 
    463             \item Columns can shrink to zero once they fall below a threshold
    464                 (the fitting part prevents the Jacobian from collapsing to zero entirely).
    465          \end{itemize}
    466     \end{frame}
    467 
    468     \begin{frame}{DLN Feature Sparsity Coefficient}
    469         \begin{itemize}[<+->]
    470             \item The Jacobian can be simplified during loss stabilization.
    471             \item Motivation: count the average number of distinct, nonzero
    472                 activations over the training set; this defines the \textbf{feature
    473                 sparsity coefficient}
    474         \end{itemize}
    475     \end{frame}
    476 
    477     \begin{frame}{DLN Results}
    478         \begin{figure}[htpb]
    479             \centering
    480             \includegraphics[width=0.8\textwidth,
    481             height=0.4\textheight]{./pics/dn_loss.png}
    482             \includegraphics[width=0.8\textwidth,
    483             height=0.4\textheight]{./pics/dn_sparsity.png}
    484             \includegraphics[width=\textwidth]{./pics/dn_setup.png}
    485             \caption{Diagonal Linear Nets Results}
    486         \end{figure}
    487     \end{frame}
    488 
    489     \begin{frame}{ReLU networks}
    490         \begin{itemize}
    491             \item \textbf{Two Layer} ReLU setup: 1D regression task with 12 points
    492             \item network with $100$ neurons, trained with SGD
    493             \item Similar results are observed
    494         \end{itemize}
    495 
    496         \begin{itemize}
    497             \item \textbf{Three Layer} ReLU setup: teacher-student setting
    498             \item Two neurons on each layer.
    499             \item Student network with $10$ neurons trained on $50$ samples
    500             \item Warm-up step size (the step size is increased according to a
    501                 schedule)
    502         \end{itemize}
    503     \end{frame}
    504 
    505     \begin{frame}{ReLU Results}
    506         \begin{figure}[htpb]
    507             \centering
    508             \includegraphics[width=0.8\textwidth,
    509             height=0.4\textheight]{./pics/relu2_loss.png}
    510             \includegraphics[width=0.8\textwidth,
    511             height=0.4\textheight]{./pics/relu2_sparsity.png}
    512             \includegraphics[width=\textwidth]{./pics/relu2_setup.png}
    513             \caption{\textbf{Two Layer} ReLU: 1D regression task}
    514         \end{figure}
    515     \end{frame}
    516 
    517     \begin{frame}{ReLU Results}
    518         \begin{figure}[htpb]
    519             \centering
    520             \includegraphics[width=0.8\textwidth,
    521             height=0.4\textheight]{./pics/relu3_loss.png}
    522             \includegraphics[width=\textwidth,
    523             height=0.4\textheight]{./pics/relu3_sparsity.png}
    524             \includegraphics[width=\textwidth]{./pics/relu3_setup.png}
    525             \caption{\textbf{Three Layer} ReLU: teacher-student setup}
    526         \end{figure}
    527     \end{frame}
    528 
    529    \begin{frame}{Deeper Networks: Results}
    530         \begin{figure}[htpb]
    531             \centering
    532             \includegraphics[width=0.8\textwidth,
    533             height=0.4\textheight]{./pics/densen_tiny_loss.png}
    534             \includegraphics[width=0.8\textwidth,
    535             height=0.4\textheight]{./pics/densen_tiny_sparsity.png}
    536             \includegraphics[width=\textwidth]{./pics/densen_tiny_setup.png}
    537             \caption{\textbf{DenseNet-100} trained on \textbf{ImageNet}
    538             (Image classification Task)}
    539         \end{figure}
    540    \end{frame}
    541 
    542 \begin{frame}{Bibliography}
    543         \nocite{andriushchenko2023sgd}
    544         \nocite{shalev2014understanding}
    545         \nocite{li2018stochastic}
    546         \nocite{pillaudvivien2022label}
    547         \printbibliography
    548     \end{frame}
    549 \end{document}