% pres1.tex
\documentclass[fleqn]{beamer}
\beamertemplatenavigationsymbolsempty

\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}

\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{mathptmx}
\usepackage{subcaption}
\usepackage{amsthm}
\usepackage{tikz}
%\usepackage[colorlinks=true,naturalnames=true,plainpages=false,pdfpagelabels=true]{hyperref}
\usetikzlibrary{patterns,decorations.pathmorphing,positioning,arrows,chains}

\usepackage[backend=biber,sorting=none]{biblatex}
\addbibresource{uni.bib}

\setbeamertemplate{endpage}{%
  \begin{frame}
    \centering
    \Large \emph{To be continued\ldots}

    \vspace{1cm}

    \centering
    \Large \emph{Thank You!}
  \end{frame}
}

\AtEndDocument{\usebeamertemplate{endpage}}

% vertical separator macro
\newcommand{\vsep}{
  \column{0.0\textwidth}
  \begin{tikzpicture}
    \draw[very thick,black!10] (0,0) -- (0,7.3);
  \end{tikzpicture}
}
\setlength{\mathindent}{0pt}

% Beamer theme
\usetheme{UniVienna}
\usefonttheme[onlysmall]{structurebold}
\mode<presentation>
\setbeamercovered{transparent=10}

\title{SGD with Large Step Sizes Learns Sparse Features}
\subtitle{Seminar Optimization}
\author[Popović Milutin]{Popović Milutin\newline\newline Supervisor: Radu Ioan Bot}
\date{31 October 2023}

\begin{document}
\begin{frame}
  \titlepage
\end{frame}

\begin{frame}{SGD (Stochastic gradient descent)}
  \begin{itemize}
    \item The objective is to minimize functions of the form
      \begin{align*}
        \hspace{0.3\linewidth}
        f(x) = \frac{1}{n} \sum_{i=1}^{n} f_i(x)
      \end{align*}
  \end{itemize}
\end{frame}

\begin{frame}{SGD (Stochastic gradient descent)}
  \begin{itemize}
    \item Training data:
      \begin{align*}
        \hspace{0.3\linewidth}
        \{(a_1, y_1),\ldots,(a_n, y_n)\} \subset \mathbb{R}^{d} \times \mathcal{Y}
      \end{align*}
    \item \mbox{}\onslide<2->{In large-scale ML: large dimension $d$\newline
      and a large number of training samples $n$.}
  \end{itemize}
\end{frame}

\begin{frame}{SGD (Stochastic gradient descent)}
  Classical examples of fitting the data by minimizing:
  \begin{itemize}
    \item \mbox{}\onslide<2->{Least squares:
      \begin{align*}
        \hspace{0.3\linewidth}
        \frac{1}{n} \| Ax - b \|_2^2 = \frac{1}{n} \sum_{i=1}^{n} (a_i^T x - b_i)^2
      \end{align*}}
    \item \mbox{}\onslide<3->{Support vector machine (SVM):
      \begin{align*}
        \hspace{0.3\linewidth}
        \frac{1}{2} \|x\|_2^2 + \frac{C}{n} \sum_{i=1}^{n} \max(0,\, 1 - y_i(x^T a_i + b))
      \end{align*}}
    \item \mbox{}\onslide<4->{Deep neural networks:
      \begin{align*}
        \hspace{0.3\linewidth}
        \frac{1}{n} \sum_{i=1}^{n} \text{loss}(y_i, \mathrm{DNN}(x; a_i))
      \end{align*}}
  \end{itemize}
\end{frame}

\begin{frame}{SGD (Stochastic gradient descent)}
  \begin{itemize}
    \item Common pattern:
      \begin{align*}
        \hspace{0.3\linewidth}
        f(x) = \frac{1}{n} \sum_{i=1}^{n} f_i(x)
      \end{align*}
  \end{itemize}
\end{frame}

\begin{frame}{GD vs SGD}
  \begin{itemize}
    \item \mbox{}\onslide<2->{GD computes the gradient of every $f_i(x)$ to update the
      next iterate.}
    \item \mbox{}\onslide<3->{SGD instead picks a pseudorandom index $i(r) \in \{1,
      2, \ldots, n\}$}
    \item \mbox{}\onslide<4->{and uses only $\nabla f_{i(r)}(x^k)$ as its descent
      direction:
      \begin{align*}
        \hspace{0.3\linewidth}
        x^{k+1} = x^{k} - t_k \nabla f_{i(r)}(x^{k})
      \end{align*}}
    \item \mbox{}\onslide<5->{Key property: $\mathbb{E}[\nabla f_{i(r)}(x)] = \nabla f(x)$}
    \item \mbox{}\onslide<5->{$\nabla f_{i(r)}(x)$ is an unbiased estimator of the full gradient!}
  \end{itemize}
\end{frame}
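
% Added illustrative slide (not in the original deck): a one-line check of the
% unbiasedness claim above, assuming i(r) is drawn uniformly from {1,...,n}.
\begin{frame}{GD vs SGD: why the estimator is unbiased}
  A quick check, assuming $i(r)$ is drawn uniformly at random from $\{1,\ldots,n\}$:
  \begin{align*}
    \mathbb{E}[\nabla f_{i(r)}(x)]
    = \sum_{i=1}^{n} \frac{1}{n} \nabla f_i(x)
    = \nabla \left( \frac{1}{n} \sum_{i=1}^{n} f_i(x) \right)
    = \nabla f(x)
  \end{align*}
\end{frame}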
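
% Added illustrative slide (not in the original deck): a minimal SGD loop in
% Python/NumPy for the least-squares example, matching the update rule on the
% previous frame. The data A, b and the constant step size t are made up.
\begin{frame}[fragile]{GD vs SGD: a minimal SGD sketch}
  An illustrative SGD loop for least squares (data and step size are made up):
\small
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(100, 5))        # made-up inputs a_i (rows of A)
b = A @ np.ones(5)                   # made-up targets b_i

x = np.zeros(5)                      # iterate x^k
t = 0.01                             # constant step size t_k = t
for k in range(10000):
    i = rng.integers(100)            # pseudorandom index i(r)
    g = 2 * (A[i] @ x - b[i]) * A[i] # gradient of f_{i(r)} only
    x = x - t * g                    # x^{k+1} = x^k - t_k * g
\end{verbatim}
\end{frame}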

\begin{frame}{Large Step Sizes Induce Sparse Features}
  \begin{itemize}
    \item \mbox{}\onslide<1->{large step sizes $\rightarrow$ \textit{loss
      stabilization}}
    \item \mbox{}\onslide<2->{the longer the large step size is used,
      the sparser the learned representation}
  \end{itemize}

  \begin{figure}
    \centering
    \includegraphics[width=0.9\textwidth]{./pics/resnet18.png}
    \caption{ResNet-18 (residual network with 18 layers) trained on
      CIFAR-10 (60k $32\times32$ images) \cite{andriushchenko2023sgd}}
  \end{figure}
\end{frame}

\begin{frame}{Bibliography}
  \nocite{andriushchenko2023sgd}
  \printbibliography
\end{frame}
\end{document}