commit b226f751a4b9fadd7e6bc446ae8099f6ddaefae1
parent d15be9da1c4ba20b46373578c42a8d1a2bdca271
Author: miksa234 <milutin@popovic.xyz>
Date: Mon, 25 Dec 2023 18:02:55 +0000
start writing
Diffstat:
9 files changed, 506 insertions(+), 0 deletions(-)
diff --git a/opt_sem/andriushchenko23b.pdf b/opt_sem/refs/andriushchenko23b.pdf
Binary files differ.
diff --git a/opt_sem/refs/sgd_conv.pdf b/opt_sem/refs/sgd_conv.pdf
Binary files differ.
diff --git a/opt_sem/refs/sgd_sde.pdf b/opt_sem/refs/sgd_sde.pdf
Binary files differ.
diff --git a/opt_sem/refs/sgd_sss.pdf b/opt_sem/refs/sgd_sss.pdf
Binary files differ.
diff --git a/opt_sem/understanding_machine_learning.pdf b/opt_sem/refs/understanding_machine_learning.pdf
Binary files differ.
diff --git a/opt_sem/summary/cite.bib b/opt_sem/summary/cite.bib
@@ -0,0 +1,28 @@
+@misc{andriushchenko2023sgd,
+ title={SGD with Large Step Sizes Learns Sparse Features},
+ author={Maksym Andriushchenko and Aditya Varre and Loucas Pillaud-Vivien and Nicolas Flammarion},
+ year={2023},
+ eprint={2210.05337},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+}
+
+@article{fast_armijo_2022,
+ author = {Hafshejani, Sajad and Gaur, Daya and Hossain, Shahadat and Benkoczi, Robert},
+ year = {2022},
+ month = {11},
+ pages = {},
+ title = {Fast Armijo line search for stochastic gradient descent},
+ doi = {10.21203/rs.3.rs-2285238/v1}
+}
+
+@book{shalev2014understanding,
+ title={Understanding Machine Learning: From Theory to Algorithms},
+ author={Shalev-Shwartz, S. and Ben-David, S.},
+ isbn={9781107057135},
+ lccn={2014001779},
+ series={Understanding Machine Learning: From Theory to Algorithms},
+ url={https://books.google.pt/books?id=ttJkAwAAQBAJ},
+ year={2014},
+ publisher={Cambridge University Press}
+}
diff --git a/opt_sem/summary/main.tex b/opt_sem/summary/main.tex
@@ -0,0 +1,187 @@
+\input{./preamble.tex}
+
+
+\begin{document}
+
+\maketitle
+
+\tableofcontents
+
+\section{Introduction}
+Large step sizes may lead the loss to stabilize by making SGD bounce above a
+valley of the loss landscape. The showcase is done with the mean squared
+error. Consider a family of prediction functions $\mathcal{H} := \{x \mapsto
+h_\theta(x), \theta \in \mathbb{R}^{p}\}$. The training loss w.r.t. the
+input/output samples $(x_i, y_i)_{i=1}^{n} \in
+\mathbb{R}^{d}\times\mathbb{R}$ is
+\begin{align}
+    \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left( h_\theta(x_i) -
+ y_i \right)^{2}.
+\end{align}
+We consider the overparametrized setting $p \gg n$, in which there exist
+many parameters $\theta^{*}$ that lead to zero loss, i.e. perfectly
+interpolate the dataset. To find such minimizers of the training loss we
+consider the SGD recursion with step size $\eta > 0$ and initialization
+$\theta_0 \in \mathbb{R}^{p}$: for all $t \in \mathbb{N}$,
+\begin{align}
+ \theta_{t+1} = \theta_t - \eta\left(h_{\theta_t}(x_{i_t})
+ - y_{i_t}\right)
+ \nabla_{\theta} h_{\theta_t}(x_{i_t}), \label{eq: sgd_it}
+\end{align}
+where $i_t \sim U(\{1,\ldots,n\})$ is a random variable following the
+discrete uniform distribution over the sample indices $\{1,\ldots,n\}$.
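+For completeness, the recursion \eqref{eq: sgd_it} is exactly one stochastic
+gradient step on the single-sample loss, since by the chain rule
+\begin{align*}
+    \nabla_\theta \frac{1}{2}\left( h_\theta(x_{i_t}) - y_{i_t} \right)^{2}
+    = \left( h_\theta(x_{i_t}) - y_{i_t} \right)
+    \nabla_{\theta} h_{\theta}(x_{i_t}).
+\end{align*}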
+\subsection{GD and SGD Relation}
+The authors highlight the respective roles of the gradient and the noise by
+explaining the connection between the SGD dynamics and full batch GD with a
+specific label noise.
+
+\begin{proposition}
+    Let $(\theta_t)_{t\ge 0}$ follow the SGD dynamics of \eqref{eq: sgd_it}
+    with the random sampling sequence $(i_t)_{t\ge 0}$. For $t\ge 0$ define
+    the random vector $\xi_t \in \mathbb{R}^{n}$ such that
+ \begin{align}
+        [\xi_t]_i := (h_{\theta_t}(x_i) - y_i)(1-n\mathbf{1}_{i=i_t}),
+    \end{align}
+    for $i \in \{1,\ldots,n\}$, where $\mathbf{1}_{A}$ denotes the indicator
+    function of the event $A$. Then $(\theta_t)_{t\ge 0}$ follows the full
+    batch gradient descent dynamics on $\mathcal{L}$ with label noise
+    $(\xi_t)_{t\ge 0}$, i.e.
+ \begin{align}
+ \theta_{t+1} = \theta_t - \frac{\eta}{n} \sum_{i=1}^{n}
+ \left( h_{\theta_t}(x_i) - y_i^{t} \right)
+ \nabla_{\theta}h_{\theta_t}(x_i),
+ \end{align}
+ where we define the random labels $y^{t} := y + \xi_t$. Furthermore
+ $\xi_t$ is a mean zero random vector with variance such that
+ $\frac{1}{n(n-1)}\mathbb{E}\|\xi_t\|^{2} = 2 \mathcal{L}(\theta_t)$.
+\end{proposition}
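+As a sanity check (added here, not part of the original statement), a short
+computation conditioning on $\theta_t$ and averaging over $i_t$ recovers both
+claims. First, $h_{\theta_t}(x_i) - y_i^{t} = (h_{\theta_t}(x_i) -
+y_i)\,n\mathbf{1}_{i=i_t}$, so the noisy full-batch step reduces to the SGD
+step \eqref{eq: sgd_it}. Second, since $\mathbb{E}[\mathbf{1}_{i=i_t}] =
+\frac{1}{n}$,
+\begin{align*}
+    \mathbb{E}[\xi_t]_i = (h_{\theta_t}(x_i)-y_i)\left(1 -
+    n\cdot\tfrac{1}{n}\right) = 0,
+    \qquad
+    \mathbb{E}[\xi_t]_i^{2} = (h_{\theta_t}(x_i)-y_i)^{2}\,(n-1),
+\end{align*}
+because $\mathbb{E}(1-n\mathbf{1}_{i=i_t})^{2} = 1 - 2 + n = n-1$. Summing
+over $i$ gives $\mathbb{E}\|\xi_t\|^{2} = (n-1)\cdot 2n\,\mathcal{L}(\theta_t)$,
+i.e. $\frac{1}{n(n-1)}\mathbb{E}\|\xi_t\|^{2} = 2\mathcal{L}(\theta_t)$.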
+Two important things are shown in the above proposition.
+\begin{enumerate}
+ \item The noisy part at state $\theta$ always belongs to the linear space
+ spanned by $\{\nabla_\theta h_\theta(x_1),\ldots, \nabla_\theta
+ h_\theta(x_n)\}$.
+
+    \item The loss can stabilize because of large step sizes; this may lead
+        to a constant effective scale of the label noise, which is
+        explained throughout the summarized paper \cite{andriushchenko2023sgd}.
+\end{enumerate}
+\section{Dynamics behind Loss Stabilization}
+For the generic quadratic loss $F(\beta) := \|X\beta - y\|^{2}$, gradient
+descent converges with step size $\eta < \frac{2}{\lambda_{\text{max}}}$,
+diverges for $\eta > \frac{2}{\lambda_{\text{max}}}$ and converges to a
+bouncing 2-periodic dynamics for $\eta = \frac{2}{\lambda_{\text{max}}}$,
+where $\lambda_{\text{max}}$ is the largest eigenvalue of the Hessian. On the
+other hand, for a nonquadratic loss there exists an open interval of step
+sizes for which the GD algorithm neither converges nor diverges \cite{andriushchenko2023sgd}.
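+To make the quadratic picture concrete (a one-dimensional illustration added
+here), consider the scalar instance $F(\beta) = (x\beta - y)^{2}$ with
+Hessian $\lambda = 2x^{2}$ and write $e_t := \beta_t - y/x$ for the error;
+one GD step gives
+\begin{align*}
+    e_{t+1} = \left(1 - \eta\lambda\right) e_t,
+\end{align*}
+so the error contracts for $\eta < \frac{2}{\lambda}$, grows for $\eta >
+\frac{2}{\lambda}$, and simply flips sign, $e_{t+1} = -e_t$, at $\eta =
+\frac{2}{\lambda}$, which is exactly the bouncing 2-periodic regime.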
+The authors complement this with an example where the loss stabilization
+occurs almost surely in the case of SGD: a regression problem with quadratic
+parametrization on one-dimensional data, with inputs $x_i$ drawn from a
+distribution $\hat{\rho}$ and outputs generated by the linear model $y_i =
+x_i\theta_{*}^{2}$. With the loss $F(\theta) := \frac{1}{4}
+\mathbb{E}_{\hat{\rho}}(y - x\theta^{2})^{2}$, the SGD iterates with step
+size $\eta>0$ follow, for $t \in \mathbb{N}$,
+\begin{align}
+    \theta_{t+1} = \theta_t + \eta \theta_t x_{i_t}(y_{i_t} -
+    x_{i_t}\theta_t^{2}).
+\end{align}
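+Indeed, differentiating the single-sample loss $\frac{1}{4}\left(y_{i_t} -
+x_{i_t}\theta^{2}\right)^{2}$ gives
+\begin{align*}
+    \frac{\partial}{\partial\theta}\, \frac{1}{4}\left(y_{i_t} -
+    x_{i_t}\theta^{2}\right)^{2}
+    = -\theta x_{i_t}\left(y_{i_t} - x_{i_t}\theta^{2}\right),
+\end{align*}
+so one SGD step of size $\eta$ is precisely the recursion above.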
+W.l.o.g., consider $\theta_* = 1 $ and $\text{supp}(\hat{\rho})=[a,b] \subset
+\mathbb{R}$. Then the following proposition holds
+
+\begin{proposition}
+ \label{prop: loss-stab}
+    For any $\eta \in (a^{-2}, 1.25b^{-2})$ and initialization $\theta_0 \in
+    (0, 1)$, for all $t>0$,
+ \begin{align}
+ &\delta_1 < F(\theta_t) < \delta_2 \qquad \text{a.s.}\\
+ &\exists T > 0,\; \forall k > T:\quad \theta_{t+2k}< 1 <
+ \theta_{t+2k+1} \qquad \text{a.s.}
+ \end{align}
+    where $\delta_1, \delta_2, T>0$ are constants.
+    TODO: give a more precise description of the constants $\delta_1$, $\delta_2$ and $T$.
+\end{proposition}
+So if the step sizes are large enough, the \textbf{loss stabilizes} between
+the level sets $\delta_1$, $\delta_2$, and after some initial phase the
+iterates bounce from one side of the \textbf{loss valley} to the other. Note
+that the result holds \textbf{almost surely}.
+
+To further understand the effect of this loss stabilization, the authors
+assume perfect stabilization and conjecture that during the loss
+stabilization SGD is well modeled by GD with constant label noise.
+\newline
+Label noise dynamics have a connection with stochastic differential
+equations (SDEs).
+TODO: Stochastic differential equations connection to SGD.
+
+To properly write a model for the stochastic differential equation (SDE)
+dynamics, the drift needs to match the full batch gradient descent and the
+noise should have the same covariance structure. The noise at state $\theta$
+lies in the span of
+$\{\nabla_{\theta}h_{\theta}(x_1),\ldots,\nabla_{\theta}h_{\theta}(x_n)\}$
+and, by the loss stabilization of Proposition \ref{prop: loss-stab}, has
+constant intensity corresponding to the stabilization level $\delta
+>0$. The following SDE model is proposed:
+\begin{align}
+ d\theta_t = -\nabla_{\theta} \mathcal{L}(\theta_t)dt + \sqrt{\eta\delta}
+ \phi_{\theta_t}(X)^{T}dB_t,
+\end{align}
+where $(B_t)_{t\ge 0}$ is standard Brownian motion in $\mathbb{R}^{n}$ and
+$\phi_{\theta}(X) := [\nabla_{\theta}h_{\theta}(x_i)^{T}]_{i=1}^{n} \in
+\mathbb{R}^{n\times p}$ referred to as the Jacobian. This SDE can be
+interpreted as the effective slow dynamics that drives the iterates while they
+bounce rapidly in some directions at the level set $\delta$.
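+Heuristically, one Euler--Maruyama step of size $\eta$ makes the connection
+to the label-noise GD dynamics visible:
+\begin{align*}
+    \theta_{t+1} = \theta_t - \eta\nabla_{\theta}\mathcal{L}(\theta_t)
+    + \sqrt{\eta\delta}\, \phi_{\theta_t}(X)^{T}\Delta B_t,
+    \qquad \Delta B_t \sim \mathcal{N}(0, \eta I_n),
+\end{align*}
+so the drift reproduces the full batch gradient step, while the injected
+noise lies in the span of the rows of $\phi_{\theta_t}(X)$, i.e. of
+$\{\nabla_{\theta}h_{\theta_t}(x_i)\}_{i=1}^{n}$.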
+\subsection{Sparse Feature Learning}
+We begin with a simple example on diagonal linear networks to show a
+sparsity-inducing dynamics, and then broaden it to a general message
+about the implicit bias induced by the effective dynamics.
+\newline
+A diagonal linear network is a two-layer linear network with only diagonal
+connections: the prediction function is $h_{u,v}(x) = \langle u, v \odot
+x\rangle = \langle u \odot v, x\rangle$, where $\odot$ denotes the
+elementwise multiplication. In this case the loss function is convex in the
+linear predictor $\beta:=u\odot v \in \mathbb{R}^{d}$, but not in the
+parameters $(u, v)$. Hence this example already exhibits rich non-convex
+dynamics. Then $\nabla_u h_{u, v}(x) = v \odot x$ and the SDE model is
+\begin{align}
+ du_t = -\nabla_u \mathcal{L}(u_t, v_t) dt + \sqrt{\eta\delta} v_t \odot
+ [X^{T}dB_t],
+\end{align}
+where $(B_t)_{t\ge 0}$ is the standard Brownian motion in $\mathbb{R}^{n}$
+and the equations are symmetric for $(v_t)_{t\ge 0}$.
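+To see where the term $v_t \odot [X^{T}dB_t]$ comes from, denote here by
+$\phi_{u}(X)$ the block of the Jacobian with respect to $u$ (notation
+introduced only for this remark). Its rows are $\nabla_u h_{u,v}(x_i)^{T} =
+(v\odot x_i)^{T} = x_i^{T}\operatorname{diag}(v)$, hence
+\begin{align*}
+    \phi_{u}(X) = X\operatorname{diag}(v)
+    \quad\Longrightarrow\quad
+    \phi_{u}(X)^{T}dB_t = \operatorname{diag}(v)\, X^{T}dB_t
+    = v \odot [X^{T}dB_t],
+\end{align*}
+which matches the general SDE model applied to the $u$-part of the Jacobian.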
+
+The analysis of this effective dynamics in (Pillaud-Vivien et al.,
+2022) shows that the linear predictor $\beta_t = u_t \odot v_t$
+\begin{enumerate}
+ \item converges exponentially to zero outside of the support of $\beta^{*}$
+ \item is with high probability in a $\mathcal{O}(\sqrt{\eta\delta})$
+ neighborhood of $\beta^{*}$ in its support after time
+ $\mathcal{O}( \delta^{-1})$.
+\end{enumerate}
+
+The first phase of SGD with large step sizes $\eta$ decreases the training
+loss until it stabilizes at some level $\delta > 0$, and during this
+loss stabilization an effective dynamics takes place: it shrinks the
+coordinates outside of the support of the sparsest signal and oscillates in
+the parameter space at level $\mathcal{O}(\sqrt{\eta\delta})$ on its support.
+Decreasing the step size afterwards therefore leads to perfect recovery of
+the sparsest predictor. TODO: Experiments.
+
+
+
+
+
+
+\section{Sparse Feature Learning}
+
+\section{Empirical Results}
+
+\section{Comparison of the Results}
+
+\section{Conclusion}
+
+
+
+\nocite{andriushchenko2023sgd}
+\nocite{shalev2014understanding}
+\nocite{fast_armijo_2022}
+\printbibliography
+
+\end{document}
diff --git a/opt_sem/summary/preamble.tex b/opt_sem/summary/preamble.tex
@@ -0,0 +1,285 @@
+\documentclass[a4paper]{article}
+
+\usepackage[T1]{fontenc}
+\usepackage[utf8]{inputenc}
+\usepackage{mlmodern}
+
+%\usepackage{ngerman} % Sprachanpassung Deutsch
+
+\usepackage{graphicx}
+\usepackage{geometry}
+\geometry{a4paper, top=15mm}
+
+\usepackage{subcaption}
+\usepackage[shortlabels]{enumitem}
+\usepackage{amssymb}
+\usepackage{amsthm}
+\usepackage{amsmath}
+\usepackage{mathtools}
+\usepackage{braket}
+\usepackage{bbm}
+\usepackage{graphicx}
+\usepackage{float}
+\usepackage{yhmath}
+\usepackage{tikz}
+\usepackage{scratch}
+\usetikzlibrary{patterns,decorations.pathmorphing,positioning}
+\usetikzlibrary{calc,decorations.markings}
+
+\usepackage[backend=biber, sorting=none]{biblatex}
+\addbibresource{cite.bib}
+
+\usepackage[framemethod=TikZ]{mdframed}
+
+\tikzstyle{titlered} =
+ [draw=black, thick, fill=white,%
+ text=black, rectangle,
+ right, minimum height=.7cm]
+
+
+\usepackage[colorlinks=true,naturalnames=true,plainpages=false,pdfpagelabels=true]{hyperref}
+\usepackage[parfill]{parskip}
+\usepackage{lipsum}
+
+\usepackage{tcolorbox}
+\tcbuselibrary{skins,breakable}
+\pagestyle{myheadings}
+\colorlet{colexam}{black}
+\numberwithin{equation}{subsection}
+
+\newcounter{definition}
+\newtcolorbox[use counter=definition, number within=subsection]{mydef}[1]{
+ empty,
+ title={\textbf{Definition~\thetcbcounter}~~(\textit{#1})},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.4cm]frame.north
+ west)--([yshift=-0.4cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\normalfont,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=0pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); }
+}
+
+\newcounter{theorem}
+\newtcolorbox[use counter=theorem, number within=subsection]{theorem}{
+ empty,
+ title={\textbf{Theorem~\thetcbcounter}},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.35cm]frame.north
+ west)--([yshift=-0.35cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\normalfont,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); }
+}
+\newcounter{lemma}
+\newtcolorbox[use counter=lemma, number within=subsection]{lemma}{
+ empty,
+ title={Lemma~\thetcbcounter},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.35cm]frame.north
+ west)--([yshift=-0.35cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\bfseries,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); }
+}
+\newcounter{proposition}
+\newtcolorbox[use counter=proposition, number within=subsection]{proposition}{
+ empty,
+ title={\textbf{Proposition~\thetcbcounter}},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.4cm]frame.north
+ west)--([yshift=-0.4cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\normalfont,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); }
+}
+\newcounter{myproof}
+\newtcolorbox[]{myproof}[1]{
+ empty,
+ title={\textbf{Proof}~\textit{#1}},
+ attach boxed title to top left,
+ fontupper=\sl,
+ boxed title style={
+ empty,
+ size=minimal,
+ bottomrule=1pt,
+ top=1pt,
+ left skip=0cm,
+ overlay=
+ {\draw[colexam,line width=1pt]([yshift=-0.35cm]frame.north
+ west)--([yshift=-0.35cm]frame.north east);}},
+ coltitle=colexam,
+ fonttitle=\normalfont,
+ before=\par\medskip\noindent,
+ parbox=false,
+ boxsep=-1pt,
+ left=0.75cm,
+ right=3mm,
+ top=4pt,
+ breakable,
+ pad at break*=0mm,
+ vfill before first,
+ overlay unbroken={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east); },
+ overlay first={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west); },
+ overlay last={
+ \draw[colexam,line width=1pt]
+ ([xshift=0.6cm, yshift=-0.5pt]frame.south
+ west)--([xshift=0.6cm,yshift=-1pt]frame.north west)
+ --([xshift=0.6cm]frame.south west)--([xshift=-13cm]frame.south east);
+ \draw[colexam,line width=0.5pt]
+ ([xshift=-1.5cm,yshift=0.5cm]frame.south east)
+ --([xshift=-1.25cm, yshift=0.5cm]frame.south east)
+ --([xshift=-1.25cm, yshift=0.25cm]frame.south east)
+ --([xshift=-1.5cm, yshift=0.25cm]frame.south east)
+ --([xshift=-1.5cm, yshift=0.5cm]frame.south east);
+ }
+}
+
+\newcommand{\eps}{\varepsilon}
+\usepackage[OT2,T1]{fontenc}
+\DeclareSymbolFont{cyrletters}{OT2}{wncyr}{m}{n}
+\DeclareMathSymbol{\Sha}{\mathalpha}{cyrletters}{"58}
+
+\markright{Popović\hfill Seminar\hfill}
+
+
+\title{University of Vienna\\
+\vspace{1cm}Seminar:\\Optimization\\
+\vspace{0.5cm}
+SGD with Large Step Sizes Learns Sparse Features
+}
+\author{
+ Milutin Popovic \\
+ Supervisor: Radu Ioan Boţ
+}
diff --git a/opt_sem/todo.md b/opt_sem/todo.md
@@ -0,0 +1,6 @@
+TODO:
+ * summarize the theory
+ * run the results
+ * see about different step size strategies
+ * compare them with the large one in the paper
+