notes

uni notes
git clone git://popovic.xyz/notes.git
Log | Files | Refs

pres1.tex (5046B)


      1 \documentclass[fleqn]{beamer} % fleqn: left-align displayed equations
      2 \beamertemplatenavigationsymbolsempty % hide the beamer navigation icon bar
      3 
      4 \usepackage[T1]{fontenc} % proper glyph encoding (hyphenation, copyable PDF text)
      5 \usepackage[utf8]{inputenc} % UTF-8 input (default since 2018; harmless to keep)
      6 
      7 \usepackage{amsmath,amssymb}
      8 \usepackage{graphicx}
      9 \usepackage{mathptmx} % NOTE(review): obsolete Times clone per l2tabu; prefer newtx when convenient
     10 \usepackage{subcaption}
     11 \usepackage{amsthm}
     12 \usepackage{tikz}
     13 %\usepackage[colorlinks=true,naturalnames=true,plainpages=false,pdfpagelabels=true]{hyperref}
     14 \usetikzlibrary{patterns,decorations.pathmorphing,positioning, arrows, chains}
     15 
     16 \usepackage[backend=biber, sorting=none]{biblatex} % bibliography via biber; cite order = appearance
     17 \addbibresource{uni.bib}
     19 \setbeamertemplate{endpage}{% custom closing slide; inserted via \AtEndDocument below
     20     \begin{frame}
     21         \centering
     22         \Large \emph{To be continued\ldots}
     23 
     24         \vspace{1cm}
     25 
     26         \centering
     27         \Large \emph{Thank You!}
     28     \end{frame}
     29 }
     30 
     31 \AtEndDocument{\usebeamertemplate{endpage}} % emit the closing slide at \end{document}
     32 
     33 % vertical separator macro: thin vertical rule between beamer columns.
     34 % Line ends inside the definition are guarded with % so the macro does not
     35 % inject spurious space tokens into the columns environment when expanded.
     36 \newcommand{\vsep}{%
     37   \column{0.0\textwidth}%
     38     \begin{tikzpicture}%
     39       \draw[very thick,black!10] (0,0) -- (0,7.3);
     40     \end{tikzpicture}%
     41 }
     42 \setlength{\mathindent}{0pt} % no left indent for fleqn display equations
     41 
     42 % Beamer theme
     43 \usetheme{UniVienna} % local/custom theme (not part of the standard beamer distribution)
     44 \usefonttheme[onlysmall]{structurebold}
     45 \mode<presentation>
     46 \setbeamercovered{transparent=10} % not-yet-revealed overlay content shown at 10% opacity
     47 
     48 \title
     49 {SGD with Large Step Sizes Learns Sparse Features}
     50 \subtitle{Seminar Optimization}
     51 \author[Popović Milutin]
     52 {Popović Milutin\newline\newline Supervisor: Radu Ioan Bot}
     53 \date{31. October 2023}
     54 
     55 \begin{document}
     56     \begin{frame}
     57         \titlepage
     58     \end{frame}
     59 
     60     \begin{frame}{SGD (Stochastic gradient descent)}
     61         \begin{itemize}
     62             \item Objective is to minimize functions of the
     63                 form
     64                 \begin{align*}
     65                     \hspace{0.3\linewidth} % NOTE(review): manual offset working around fleqn; recurs on every frame
     66                     f(x) = \frac{1}{n} \sum_{i=1}^{n} f_i(x)
     67             \end{align*}
     68         \end{itemize}
     69     \end{frame}
     70 
     71     \begin{frame}{SGD (Stochastic gradient descent)}
     72         \begin{itemize}
     73             \item Training Data:
     74                 \begin{align*}
     75                     \hspace{0.3\linewidth}
     76                     \{(x_1, y_1),\ldots,(x_n,y_n)\} \in \mathbb{R}^{d} \times
     77                     \mathcal{Y}
     78             \end{align*}
     79         \item \mbox{}\onslide<2->{In large-scale ML: large dimension $d$\newline % second overlay step
     80             and large number of training data $n$.}
     81         \end{itemize}
     82     \end{frame}
     83 
     84     \begin{frame}{SGD (Stochastic gradient descent)}
     85         Classical examples of fitting the data via minimizing:
     86         \begin{itemize}
     87             \item \mbox{}\onslide<2->{Least Squares
     88                 \begin{align*}
     89                     \hspace{0.3\linewidth}
     90                     \frac{1}{n} \| Ax - b \|_2^2 =\frac{1}{n}\sum_{i=1}^n(a_i^T x - b_i)^2
     91             \end{align*}}
     92             \item \mbox{}\onslide<3->{Support Vector Machine (SVM):
     93                 \begin{align*}
     94                     \hspace{0.3\linewidth}
     95                     % \max as upright operator; \| for norms; sum bounds spelled out
     96                     \frac{1}{2}\|x\|_2^2 + \frac{C}{n} \sum_{i=1}^n \max(0, 1 - y_i(x^T a_i + b))
     97             \end{align*}}
     98         \item \mbox{}\onslide<4->{Deep Neural Nets
     99                 \begin{align*}
    100                     \hspace{0.3\linewidth}
    101                     % \mathrm{DNN}: multi-letter identifier, not a product of variables
    102                     \frac{1}{n} \sum_{i=1}^n \text{loss}(y_i, \mathrm{DNN}(x; a_i))
    103                 \end{align*}}
    104 
    105         \end{itemize}
    106     \end{frame}
    105 
    106     \begin{frame}{SGD (Stochastic gradient descent)}
    107         \begin{itemize}
    108             \item Common pattern: % recap slide: all three examples share the finite-sum form
    109                 \begin{align*}
    110                     \hspace{0.3\linewidth}
    111                     f(x) = \frac{1}{n} \sum_{i=1}^{n} f_i(x)
    112                 \end{align*}
    113         \end{itemize}
    114     \end{frame}
    115 
    116     \begin{frame}{GD vs SGD}
    117         \begin{itemize}
    118             \item \mbox{}\onslide<2->{GD would compute the gradient of every $f_i(x)$ to update the
    119                 next iterate}
    120             \item \mbox{}\onslide<3->{SGD picks a pseudorandom $i(r) \in \{1,
    121                 2, \ldots, n\}$}
    122             % iterate consistently written $x^{k}$ (superscript), matching the update rule below
    123             \item \mbox{}\onslide<4->{then uses only $\nabla f_{i(r)}(x^{k})$ as its descent
    124                 direction
    125                 \begin{align*}
    126                     \hspace{0.3\linewidth}
    127                     x^{k+1} = x^{k} - t_k \nabla f_{i(r)}(x^{k})
    128             \end{align*}}
    129 
    130         \item \mbox{}\onslide<5->{
    131             Key property: $\mathbb{E}[\nabla f_{i(r)}(x)] = \nabla f(x)$}
    132         \item  \mbox{}\onslide<5->{
    133             $\nabla f_{i(r)}(x)$ is an unbiased estimator!
    134             }
    135         \end{itemize}
    136     \end{frame}
    136 
    137     \begin{frame}{Large stepsizes induce Sparse Features}
    138         \begin{itemize}
    139             % $\rightarrow$ instead of literal "->" (renders wrongly in text mode);
    140             % \emph for semantic emphasis
    141             \item  \mbox{}\onslide<1->{large step sizes $\rightarrow$ \emph{loss
    142                 stabilization}}
    143             \item \mbox{}\onslide<2->{the longer the larger step size is used
    144                 the better the sparse representation}
    145         \end{itemize}
    146 
    147         % no [H]: the float package is not loaded and beamer frames do not float anyway
    148         \begin{figure}
    149             \centering
    150             \includegraphics[width=0.9\textwidth]{./pics/resnet18.png}
    151             \caption{ResNet-18 (Residual Network with 18 layers) trained on
    152             CIFAR-10 (60k $32\times32$ images)~\cite{andriushchenko2023sgd}}
    153         \end{figure}
    154     \end{frame}
    152 
    153     \begin{frame}{Bibliography}
    154         \nocite{andriushchenko2023sgd} % ensure the entry appears even if never \cite'd
    155         \printbibliography
    156     \end{frame}
    157 \end{document}