commit b226f751a4b9fadd7e6bc446ae8099f6ddaefae1
parent d15be9da1c4ba20b46373578c42a8d1a2bdca271
Author: miksa234 <milutin@popovic.xyz>
Date: Mon, 25 Dec 2023 18:02:55 +0000
start writing
Diffstat:
9 files changed, 506 insertions(+), 0 deletions(-)
diff --git a/opt_sem/andriushchenko23b.pdf b/opt_sem/refs/andriushchenko23b.pdf
Binary files differ.
diff --git a/opt_sem/refs/sgd_conv.pdf b/opt_sem/refs/sgd_conv.pdf
Binary files differ.
diff --git a/opt_sem/refs/sgd_sde.pdf b/opt_sem/refs/sgd_sde.pdf
Binary files differ.
diff --git a/opt_sem/refs/sgd_sss.pdf b/opt_sem/refs/sgd_sss.pdf
Binary files differ.
diff --git a/opt_sem/understanding_machine_learning.pdf b/opt_sem/refs/understanding_machine_learning.pdf
Binary files differ.
diff --git a/opt_sem/summary/cite.bib b/opt_sem/summary/cite.bib
@@ -0,0 +1,28 @@
+@misc{andriushchenko2023sgd,
+ title={SGD with Large Step Sizes Learns Sparse Features},
+ author={Maksym Andriushchenko and Aditya Varre and Loucas Pillaud-Vivien and Nicolas Flammarion},
+ year={2023},
+ eprint={2210.05337},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+}
+
+@article{fast_armijo_2022,
+ author = {Hafshejani, Sajad and Gaur, Daya and Hossain, Shahadat and Benkoczi, Robert},
+ year = {2022},
+ month = {11},
+ pages = {},
+ title = {Fast Armijo line search for stochastic gradient descent},
+ doi = {10.21203/rs.3.rs-2285238/v1}
+}
+
+@book{shalev2014understanding,
+ title={Understanding Machine Learning: From Theory to Algorithms},
+ author={Shalev-Shwartz, S. and Ben-David, S.},
+ isbn={9781107057135},
+ lccn={2014001779},
+ series={Understanding Machine Learning: From Theory to Algorithms},
+ url={https://books.google.pt/books?id=ttJkAwAAQBAJ},
+ year={2014},
+ publisher={Cambridge University Press}
+}
diff --git a/opt_sem/summary/main.tex b/opt_sem/summary/main.tex
@@ -0,0 +1,187 @@
+\input{./preamble.tex}
+
+
+\begin{document}
+
+\maketitle
+
+\tableofcontents
+
+\section{Introduction}
+Large step sizes may lead the loss to stabilize by making SGD bounce above a
+valley of the loss landscape. The showcase is done with the mean squared
+error. Consider a family of prediction functions $\mathcal{H} := \{x \mapsto
+h_\theta(x), \theta \in \mathbb{R}^{p}\}$. The training loss w.r.t. the
+input/output samples $(x_i, y_i)_{i=1}^{n} \in
+\mathbb{R}^{d}\times\mathbb{R}$ is
+\begin{align}
+    \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left( h_\theta(x_i) -
+ y_i \right)^{2}.
+\end{align}
+We consider the overparametrized setting $p \gg n$, in which there exist
+many parameters $\theta^{*}$ that lead to zero loss, i.e. perfectly
+interpolate the dataset. To find such minimizers of the training loss we
+consider the SGD recursion with step size $\eta > 0$ and initialization
+$\theta_0 \in \mathbb{R}^{p}$: for all $t \in \mathbb{N}$,
+\begin{align}
+ \theta_{t+1} = \theta_t - \eta\left(h_{\theta_t}(x_{i_t})
+ - y_{i_t}\right)
+ \nabla_{\theta} h_{\theta_t}(x_{i_t}), \label{eq: sgd_it}
+\end{align}
+where $i_t \sim U(\{1,\ldots,n\})$ is a random variable following the
+discrete uniform distribution over the sample indices $\{1,\ldots,n\}$.
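+For completeness, the recursion \eqref{eq: sgd_it} is exactly one stochastic
+gradient step on the single-sample loss, since by the chain rule
+\begin{align*}
+    \nabla_\theta \frac{1}{2}\left( h_\theta(x_{i_t}) - y_{i_t} \right)^{2}
+    = \left( h_\theta(x_{i_t}) - y_{i_t} \right)
+    \nabla_{\theta} h_{\theta}(x_{i_t}).
+\end{align*}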
+\subsection{GD and SGD Relation}
+The authors highlight the respective roles of the gradient and the noise by
+explaining the connection between the SGD dynamics and full batch GD with a
+specific label noise.
+
+\begin{proposition}
+    Let $(\theta_t)_{t\ge 0}$ follow the SGD dynamics of \eqref{eq: sgd_it}
+    with the random sampling sequence $(i_t)_{t\ge 0}$. For $t\ge 0$ define
+    the random vector $\xi_t \in \mathbb{R}^{n}$ such that
+ \begin{align}
+        [\xi_t]_i := (h_{\theta_t}(x_i) - y_i)(1-n\mathbf{1}_{i=i_t}),
+    \end{align}
+    for $i \in \{1,\ldots,n\}$, where $\mathbf{1}_{A}$ denotes the indicator
+    function of the event $A$. Then $(\theta_t)_{t\ge 0}$ follows the full
+    batch gradient descent dynamics on $\mathcal{L}$ with label noise
+    $(\xi_t)_{t\ge 0}$, i.e.
+ \begin{align}
+ \theta_{t+1} = \theta_t - \frac{\eta}{n} \sum_{i=1}^{n}
+ \left( h_{\theta_t}(x_i) - y_i^{t} \right)
+ \nabla_{\theta}h_{\theta_t}(x_i),
+ \end{align}
+ where we define the random labels $y^{t} := y + \xi_t$. Furthermore
+ $\xi_t$ is a mean zero random vector with variance such that
+ $\frac{1}{n(n-1)}\mathbb{E}\|\xi_t\|^{2} = 2 \mathcal{L}(\theta_t)$.
+\end{proposition}
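+As a sanity check (added here, not part of the original statement), a short
+computation conditioning on $\theta_t$ and averaging over $i_t$ recovers both
+claims. First, $h_{\theta_t}(x_i) - y_i^{t} = (h_{\theta_t}(x_i) -
+y_i)\,n\mathbf{1}_{i=i_t}$, so the noisy full-batch step reduces to the SGD
+step \eqref{eq: sgd_it}. Second, since $\mathbb{E}[\mathbf{1}_{i=i_t}] =
+\frac{1}{n}$,
+\begin{align*}
+    \mathbb{E}[\xi_t]_i = (h_{\theta_t}(x_i)-y_i)\left(1 -
+    n\cdot\tfrac{1}{n}\right) = 0,
+    \qquad
+    \mathbb{E}[\xi_t]_i^{2} = (h_{\theta_t}(x_i)-y_i)^{2}\,(n-1),
+\end{align*}
+because $\mathbb{E}(1-n\mathbf{1}_{i=i_t})^{2} = 1 - 2 + n = n-1$. Summing
+over $i$ gives $\mathbb{E}\|\xi_t\|^{2} = (n-1)\cdot 2n\,\mathcal{L}(\theta_t)$,
+i.e. $\frac{1}{n(n-1)}\mathbb{E}\|\xi_t\|^{2} = 2\mathcal{L}(\theta_t)$.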
+Two important things are shown in the above proposition.
+\begin{enumerate}
+ \item The noisy part at state $\theta$ always belongs to the linear space
+ spanned by $\{\nabla_\theta h_\theta(x_1),\ldots, \nabla_\theta
+ h_\theta(x_n)\}$.
+
+    \item The loss can stabilize because of large step sizes; this may lead
+        to a constant effective scale of the label noise, which is
+        explained throughout the summarized paper \cite{andriushchenko2023sgd}.
+\end{enumerate}
+\section{Dynamics behind Loss Stabilization}
+For the generic quadratic loss $F(\beta) := \|X\beta - y\|^{2}$, gradient
+descent converges with step size $\eta < \frac{2}{\lambda_{\text{max}}}$,
+diverges for $\eta > \frac{2}{\lambda_{\text{max}}}$ and converges to a
+bouncing 2-periodic dynamics for $\eta = \frac{2}{\lambda_{\text{max}}}$,
+where $\lambda_{\text{max}}$ is the largest eigenvalue of the Hessian. On the
+other hand, for a nonquadratic loss there exists an open interval of step
+sizes for which the GD algorithm neither converges nor diverges \cite{andriushchenko2023sgd}.
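+To make the quadratic picture concrete (a one-dimensional illustration added
+here), consider the scalar instance $F(\beta) = (x\beta - y)^{2}$ with
+Hessian $\lambda = 2x^{2}$ and write $e_t := \beta_t - y/x$ for the error;
+one GD step gives
+\begin{align*}
+    e_{t+1} = \left(1 - \eta\lambda\right) e_t,
+\end{align*}
+so the error contracts for $\eta < \frac{2}{\lambda}$, grows for $\eta >
+\frac{2}{\lambda}$, and simply flips sign, $e_{t+1} = -e_t$, at $\eta =
+\frac{2}{\lambda}$, which is exactly the bouncing 2-periodic regime.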
+The authors complement this with an example where the loss stabilization
+occurs almost surely in the case of SGD: a regression problem with quadratic
+parametrization on one-dimensional data, with inputs $x_i$ drawn from a
+distribution $\hat{\rho}$ and outputs generated by the linear model $y_i =
+x_i\theta_{*}^{2}$. With the loss $F(\theta) := \frac{1}{4}
+\mathbb{E}_{\hat{\rho}}(y - x\theta^{2})^{2}$, the SGD iterates with step
+size $\eta>0$ follow, for $t \in \mathbb{N}$,
+\begin{align}
+    \theta_{t+1} = \theta_t + \eta \theta_t x_{i_t}(y_{i_t} -
+    x_{i_t}\theta_t^{2}).
+\end{align}
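+Indeed, differentiating the single-sample loss $\frac{1}{4}\left(y_{i_t} -
+x_{i_t}\theta^{2}\right)^{2}$ gives
+\begin{align*}
+    \frac{\partial}{\partial\theta}\, \frac{1}{4}\left(y_{i_t} -
+    x_{i_t}\theta^{2}\right)^{2}
+    = -\theta x_{i_t}\left(y_{i_t} - x_{i_t}\theta^{2}\right),
+\end{align*}
+so one SGD step of size $\eta$ is precisely the recursion above.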
+W.l.o.g., consider $\theta_* = 1 $ and $\text{supp}(\hat{\rho})=[a,b] \subset
+\mathbb{R}$. Then the following proposition holds
+
+\begin{proposition}
+ \label{prop: loss-stab}
+    For any $\eta \in (a^{-2}, 1.25b^{-2})$ and initialization $\theta_0 \in
+    (0, 1)$, for all $t>0$,
+ \begin{align}
+ &\delta_1 < F(\theta_t) < \delta_2 \qquad \text{a.s.}\\
+ &\exists T > 0,\; \forall k > T:\quad \theta_{t+2k}< 1 <
+ \theta_{t+2k+1} \qquad \text{a.s.}
+ \end{align}
+    where $\delta_1, \delta_2, T>0$ are constants.
+    TODO: give a more precise description of the constants $\delta_1$, $\delta_2$ and $T$.
+\end{proposition}
+So if the step sizes are large enough, the \textbf{loss stabilizes} between
+the level sets $\delta_1$, $\delta_2$, and after some initial phase the
+iterates bounce from one side of the \textbf{loss valley} to the other. Note
+that the result holds \textbf{almost surely}.
+
+To further understand the effect of this loss stabilization, the authors
+assume perfect stabilization and conjecture that during the loss
+stabilization SGD is well modeled by GD with constant label noise.
+\newline
+Label noise dynamics have a connection with stochastic differential
+equations (SDEs).
+TODO: Stochastic differential equations connection to SGD.
+
+To properly write a model for the stochastic differential equation (SDE)
+dynamics, the drift needs to match the full batch gradient descent and the
+noise should have the same covariance structure. The noise at state $\theta$
+lies in the span of
+$\{\nabla_{\theta}h_{\theta}(x_1),\ldots,\nabla_{\theta}h_{\theta}(x_n)\}$
+and, by the loss stabilization of Proposition \ref{prop: loss-stab}, has
+constant intensity corresponding to the stabilization level $\delta
+>0$. The following SDE model is proposed:
+\begin{align}
+ d\theta_t = -\nabla_{\theta} \mathcal{L}(\theta_t)dt + \sqrt{\eta\delta}
+ \phi_{\theta_t}(X)^{T}dB_t,
+\end{align}
+where $(B_t)_{t\ge 0}$ is standard Brownian motion in $\mathbb{R}^{n}$ and
+$\phi_{\theta}(X) := [\nabla_{\theta}h_{\theta}(x_i)^{T}]_{i=1}^{n} \in
+\mathbb{R}^{n\times p}$ referred to as the Jacobian. This SDE can be
+interpreted as the effective slow dynamics that drives the iterates while they
+bounce rapidly in some directions at the level set $\delta$.
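+Heuristically, one Euler--Maruyama step of size $\eta$ makes the connection
+to the label-noise GD dynamics visible:
+\begin{align*}
+    \theta_{t+1} = \theta_t - \eta\nabla_{\theta}\mathcal{L}(\theta_t)
+    + \sqrt{\eta\delta}\, \phi_{\theta_t}(X)^{T}\Delta B_t,
+    \qquad \Delta B_t \sim \mathcal{N}(0, \eta I_n),
+\end{align*}
+so the drift reproduces the full batch gradient step, while the injected
+noise lies in the span of the rows of $\phi_{\theta_t}(X)$, i.e. of
+$\{\nabla_{\theta}h_{\theta_t}(x_i)\}_{i=1}^{n}$.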
+\subsection{Sparse Feature Learning}
+We begin with a simple example on diagonal linear networks to show a
+sparsity-inducing dynamics, and then broaden it to a general message
+about the implicit bias induced by the effective dynamics.
+\newline
+A diagonal linear network is a two-layer linear network with only diagonal
+connections: the prediction function is $h_{u,v}(x) = \langle u, v \odot
+x\rangle = \langle u \odot v, x\rangle$, where $\odot$ denotes the
+elementwise multiplication. In this case the loss function is convex in the
+linear predictor $\beta:=u\odot v \in \mathbb{R}^{d}$, but not in the
+parameters $(u, v)$. Hence this example already exhibits rich non-convex
+dynamics. Then $\nabla_u h_{u, v}(x) = v \odot x$ and the SDE model is
+\begin{align}
+ du_t = -\nabla_u \mathcal{L}(u_t, v_t) dt + \sqrt{\eta\delta} v_t \odot
+ [X^{T}dB_t],
+\end{align}
+where $(B_t)_{t\ge 0}$ is the standard Brownian motion in $\mathbb{R}^{n}$
+and the equations are symmetric for $(v_t)_{t\ge 0}$.
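+To see where the term $v_t \odot [X^{T}dB_t]$ comes from, denote here by
+$\phi_{u}(X)$ the block of the Jacobian with respect to $u$ (notation
+introduced only for this remark). Its rows are $\nabla_u h_{u,v}(x_i)^{T} =
+(v\odot x_i)^{T} = x_i^{T}\operatorname{diag}(v)$, hence
+\begin{align*}
+    \phi_{u}(X) = X\operatorname{diag}(v)
+    \quad\Longrightarrow\quad
+    \phi_{u}(X)^{T}dB_t = \operatorname{diag}(v)\, X^{T}dB_t
+    = v \odot [X^{T}dB_t],
+\end{align*}
+which matches the general SDE model applied to the $u$-part of the Jacobian.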
+
+The analysis of this effective dynamics in (Pillaud-Vivien et al.,
+2022) shows that the linear predictor $\beta_t = u_t \odot v_t$
+\begin{enumerate}
+ \item converges exponentially to zero outside of the support of $\beta^{*}$
+ \item is with high probability in a $\mathcal{O}(\sqrt{\eta\delta})$
+ neighborhood of $\beta^{*}$ in its support after time
+ $\mathcal{O}( \delta^{-1})$.
+\end{enumerate}
+
+The first phase of SGD with large step sizes $\eta$ decreases the training
+loss until it stabilizes at some level $\delta > 0$, and during this
+loss stabilization an effective dynamics takes place: it shrinks the
+coordinates outside of the support of the sparsest signal and oscillates in
+the parameter space at level $\mathcal{O}(\sqrt{\eta\delta})$ on its support.
+Decreasing the step size afterwards therefore leads to perfect recovery of
+the sparsest predictor. TODO: Experiments.
+
+
+
+
+
+
+\section{Sparse Feature Learning}
+
+\section{Empirical Results}
+
+\section{Comparison of the Results}
+
+\section{Conclusion}
+
+
+
+\nocite{andriushchenko2023sgd}
+\nocite{shalev2014understanding}
+\nocite{fast_armijo_2022}
+\printbibliography
+
+\end{document}
diff --git a/opt_sem/summary/preamble.tex b/opt_sem/summary/preamble.tex
@@ -0,0 +1,285 @@
+\documentclass[a4paper]{article}
+
+\usepackage[T1]{fontenc}
+\usepackage[utf8]{inputenc}
+\usepackage{mlmodern}
+
+%\usepackage{ngerman} % Sprachanpassung Deutsch
+
+\usepackage{graphicx}
+\usepackage{geometry}
+\geometry{a4paper, top=15mm}
+
+\usepackage{subcaption}
+\usepackage[shortlabels]{enumitem}
+\usepackage{amssymb}
+\usepackage{amsthm}
+\usepackage{amsmath}
+\usepackage{mathtools}
+\usepackage{braket}
+\usepackage{bbm}
+\usepackage{graphicx}
+\usepackage{float}
+\usepackage{yhmath}
+\usepackage{tikz}
+\usepackage{scratch}
+\usetikzlibrary{patterns,decorations.pathmorphing,positioning}
+\usetikzlibrary{calc,decorations.markings}
+
+\usepackage[backend=biber, sorting=none]{biblatex}
+\addbibresource{cite.bib}
+
+\usepackage[framemethod=TikZ]{mdframed}
+
+\tikzstyle{titlered} =
+ [draw=black, thick, fill=white,%
+ text=black, rectangle,
+ right, minimum height=.7cm]
+
+
+\usepackage[colorlinks=true,naturalnames=true,plainpages=false,pdfpagelabels=true]{hyperref}
+\usepackage[parfill]{parskip}
+\usepackage{lipsum}
+
+\usepackage{tcolorbox}
+\tcbuselibrary{skins,breakable}
+\pagestyle{myheadings}
+\colorlet{colexam}{black}
+\numberwithin{equation}{subsection}
+
+\newcounter{definition}
+\newtcolorbox[use counter=definition, number within=subsection]{mydef}[1]{
+ empty,
+ title={\textbf{Definition~\thetcbcounter}~~(\textit{#1})},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.4cm]frame.north
+ west)--([yshift=-0.4cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\normalfont,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=0pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); }
+}
+
+\newcounter{theorem}
+\newtcolorbox[use counter=theorem, number within=subsection]{theorem}{
+ empty,
+ title={\textbf{Theorem~\thetcbcounter}},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.35cm]frame.north
+ west)--([yshift=-0.35cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\normalfont,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); }
+}
+\newcounter{lemma}
+\newtcolorbox[use counter=lemma, number within=subsection]{lemma}{
+ empty,
+ title={Lemma~\thetcbcounter},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.35cm]frame.north
+ west)--([yshift=-0.35cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\bfseries,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); }
+}
+\newcounter{proposition}
+\newtcolorbox[use counter=proposition, number within=subsection]{proposition}{
+ empty,
+ title={\textbf{Proposition~\thetcbcounter}},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.4cm]frame.north
+ west)--([yshift=-0.4cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\normalfont,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); }
+}
+\newcounter{myproof}
+\newtcolorbox[]{myproof}[1]{
+ empty,
+ title={\textbf{Proof}~\textit{#1}},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.35cm]frame.north
+ west)--([yshift=-0.35cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\normalfont,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east);
+ \draw[colexam,line width=0.5pt]
+ ([xshift=-1.5cm,yshift=0.5cm]frame.south east)
+ --([xshift=-1.25cm, yshift=0.5cm]frame.south east)
+ --([xshift=-1.25cm, yshift=0.25cm]frame.south east)
+ --([xshift=-1.5cm, yshift=0.25cm]frame.south east)
+ --([xshift=-1.5cm, yshift=0.5cm]frame.south east);
+ }
+}
+
+\newcommand{\eps}{\varepsilon}
+\usepackage[OT2,T1]{fontenc}
+\DeclareSymbolFont{cyrletters}{OT2}{wncyr}{m}{n}
+\DeclareMathSymbol{\Sha}{\mathalpha}{cyrletters}{"58}
+
+\markright{Popović\hfill Seminar\hfill}
+
+
+\title{University of Vienna\\
+\vspace{1cm}Seminar:\\Optimization\\
+\vspace{0.5cm}
+SGD with Large Step Sizes Learns Sparse Features
+}
+\author{
+ Milutin Popovic \\
+ Supervisor: Radu Ioan Boţ
+}
diff --git a/opt_sem/todo.md b/opt_sem/todo.md
@@ -0,0 +1,6 @@
+TODO:
+ * summarize the theory
+ * run the results
+ * see about different step size strategies
+ * compare them with the large one in the paper
+