\input{./preamble.tex}
      2 
      3 
      4 \begin{document}
      5 
      6 \maketitle
      7 
      8 \tableofcontents
      9 
     10 \section{Introduction}
Large step sizes may lead the training loss to stabilize by making Stochastic
Gradient Descent (SGD) bounce around a valley of the loss landscape.
     13 
The setting is showcased with the mean squared error. Consider a family of
prediction functions $\mathcal{H} := \{x \mapsto h_\theta(x),\ \theta \in
\mathbb{R}^{p}\}$. The training loss w.r.t. the input/output samples $(x_i,
y_i)_{i=1}^{n} \subset \mathbb{R}^{d}\times\mathbb{R}$ is
     18 \begin{align}
     19     \mathcal{L}(\theta) := \frac{1}{2n} \sum_{i=1}^{n} \left( h_\theta(x_i) -
     20     y_i \right)^{2}.
     21 \end{align}
We consider the overparametrized setting $p \gg n$, in which there exist many
parameters $\theta^{*}$ that lead to zero loss, i.e. perfectly interpolate the
dataset. To find such minimizers of the training loss we consider the SGD
recursion with step size $\eta > 0$ and initialization $\theta_0 \in
\mathbb{R}^{p}$: for all $t \in \mathbb{N}$
     27 \begin{align}
     28     \theta_{t+1} = \theta_t - \eta\left(h_{\theta_t}(x_{i_t})
     29     - y_{i_t}\right)
     30     \nabla_{\theta} h_{\theta_t}(x_{i_t}), \label{eq: sgd_it}
     31 \end{align}
where $i_t \sim U(\{1,\ldots,n\})$ is a random variable following the
discrete uniform distribution over the sample indices. The main focus of
this summary is the parameter $\eta$, called the \textit{step size} or
\textit{learning rate} in the literature. The authors of
\cite{andriushchenko2023sgd} conjecture that larger step sizes lead to a so
called \textit{loss stabilization}, where the loss almost surely stays
around a fixed level set. They argue that this loss stabilization in turn
causes sparse feature learning, i.e. that the learned prediction function has
better \textit{prediction performance} when the underlying data has a sparse
(mostly zero) structure. They first prove this conjecture rigorously on a
simplified model and then give empirical evidence for more complex settings.
This summary tries to give an overview of, and the background required to
understand, what the paper sets out to achieve.
     45 \subsection{GD and SGD Relation}
The authors highlight the respective roles of the gradient and the noise by
explaining the connection between the SGD dynamics and full-batch GD with a
specific label noise.
     49 
     50 \begin{proposition}
    Let $(\theta_t)_{t\ge 0}$ follow the SGD dynamics of Equation
    \ref{eq: sgd_it} with the random index sequence $(i_t)_{t\ge 0}$. For
    $t\ge 0$ define the random vector $\xi_t \in \mathbb{R}^{n}$ by
     54     \begin{align}
        [\xi_t]_i := (h_{\theta_t}(x_i) - y_i)(1-n\mathbf{1}_{i=i_t}),
    \end{align}
    for $i \in \{1,\ldots,n\}$, where $\mathbf{1}_{A}$ denotes the indicator
    function of the event $A$. Then $(\theta_t)_{t\ge 0}$ follows the full
    batch gradient descent dynamics on $\mathcal{L}$ with label noise
    $(\xi_t)_{t\ge 0}$, i.e.
     61     \begin{align}
     62         \theta_{t+1} = \theta_t - \frac{\eta}{n} \sum_{i=1}^{n}
     63         \left( h_{\theta_t}(x_i) - y_i^{t} \right)
     64         \nabla_{\theta}h_{\theta_t}(x_i),
     65     \end{align}
    where we define the random labels $y^{t} := y + \xi_t$. Furthermore,
    conditionally on $\theta_t$, $\xi_t$ is a mean-zero random vector whose
    second moment satisfies
    $\frac{1}{n(n-1)}\mathbb{E}\|\xi_t\|^{2} = 2 \mathcal{L}(\theta_t)$.
     69 \end{proposition}
Two important facts are shown in the above proposition.
     71 \begin{enumerate}
     72     \item The noisy part at state $\theta$ always belongs to the linear space
     73         spanned by $\{\nabla_\theta h_\theta(x_1),\ldots, \nabla_\theta
     74         h_\theta(x_n)\}$.
     75 
    \item When the loss stabilizes because of a large step size, the
        effective scale of the label noise stays roughly constant, which is
        explained throughout the summarized paper \cite{andriushchenko2023sgd}.
     79 \end{enumerate}
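The reformulation can be checked numerically. The following is a minimal
sketch (my own, not from the paper), assuming a linear model
$h_\theta(x) = \langle\theta, x\rangle$, that compares one SGD step with the
corresponding full-batch GD step on the noisy labels and verifies the
variance identity:
\begin{verbatim}
# Sanity check of the SGD <-> GD-with-label-noise reformulation,
# assuming a linear model h_theta(x) = <theta, x>.
import numpy as np

rng = np.random.default_rng(0)
n, d, eta = 8, 5, 0.1
X = rng.normal(size=(n, d))
y = rng.normal(size=n)
theta = rng.normal(size=d)

# One SGD step on the squared loss with a uniformly sampled index i_t.
i_t = rng.integers(n)
residual = X @ theta - y                 # h_theta(x_i) - y_i for all i
theta_sgd = theta - eta * residual[i_t] * X[i_t]

# The equivalent full-batch GD step with label noise xi_t.
xi = residual * (1 - n * (np.arange(n) == i_t))
y_noisy = y + xi
theta_gd = theta - eta / n * X.T @ (X @ theta - y_noisy)
print(np.allclose(theta_sgd, theta_gd))  # True: the two updates coincide

# E||xi_t||^2 / (n (n-1)) = 2 L(theta), since each coordinate of xi_t has
# conditional second moment (n - 1) * (h_theta(x_i) - y_i)^2.
loss = 0.5 / n * np.sum(residual ** 2)
print(np.isclose((n - 1) * np.sum(residual ** 2) / (n * (n - 1)), 2 * loss))
\end{verbatim}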
     80 \section{Loss Stabilization: Toy Model}
     81 For quadratic loss
     82 \begin{align}
     83 F(\beta) := \|X\beta - y\|^{2},
     84 \end{align}
     85 gradient descent
     86 \begin{itemize}
    \item converges for any step size $\eta < \frac{2}{\lambda_{\text{max}}}$,
    \item diverges for $\eta > \frac{2}{\lambda_{\text{max}}}$,
    \item and converges to a bouncing 2-periodic dynamics for $\eta =
        \frac{2}{\lambda_{\text{max}}}$,
     91 \end{itemize}
where $\lambda_{\text{max}}$ is the largest eigenvalue of the Hessian. On the
other hand, for a nonquadratic loss there exists an open interval of step
sizes for which the GD algorithm neither converges nor diverges
\cite{andriushchenko2023sgd}. The authors complement this with an example
where loss stabilization occurs \textbf{almost surely} in the case of SGD.
     97 \newline
     98 
Consider a regression example with quadratic parametrization on
one-dimensional data: inputs $x_i \sim \hat{\rho}$ and outputs generated by
the linear model $y_i = x_i\theta_{*}^{2}$. With the loss $F(\theta) :=
\frac{1}{4} \mathbb{E}_{\hat{\rho}}(y - x\theta^{2})^{2}$, the SGD iterates
with step size $\eta>0$ are, for $t \in \mathbb{N}$,
\begin{align}
    \theta_{t+1} = \theta_t + \eta \theta_t x_{i_t}(y_{i_t} -
    x_{i_t}\theta_t^{2}).
\end{align}
    108 In this case the following holds
    109 \begin{proposition}
    110     \label{prop: loss-stab}
    Assume there exist $x_{\text{min}}, x_{\text{max}} > 0$ s.t.
    $\text{supp}(\hat{\rho}) \subset [x_{\text{min}}, x_{\text{max}}]$. Then
    for any $\eta \in ((\theta_*x_{\text{min}})^{-2},
    1.25(\theta_*x_{\text{max}})^{-2})$ and any initialization $\theta_0 \in
    (0, \theta_*)$, for all $t \in \mathbb{N}$ we have
    116     \begin{align}
    117         F(\theta_t) \in (\varepsilon_0\theta_*^{2}, 0.17\theta_*^{2}) \quad
    118         \text{\textbf{almost surely}},
    119     \end{align}
    where $\varepsilon_0 = \min \{\frac{\eta(\theta_* x_{\text{min}})^{2}-
    1}{3}, 0.02\}$. Moreover, \textbf{almost surely} there exists $t > 0$
    such that for all $k \ge 0$:
    122     \begin{align}
    123         &\theta_{t+2k} \in (0.65\theta_*, (1-\varepsilon_0)\theta_*) \quad
    124         \text{and}\\
    125         &\theta_{t+2k+1} \in ((1-\varepsilon_0)\theta_*, 1.162\theta_*)
    126     \end{align}
    127 \end{proposition}
So if the step size is large enough, the \textbf{loss stabilizes} between two
level sets $\delta_1$, $\delta_2$, and after some initial phase the iterates
bounce from one side of the \textbf{loss valley} to the other. Note that the
result holds \textbf{almost surely}. The proof idea is to first normalize the
SGD recursion via $\theta_t \to \theta_t / \theta_*$, i.e.
\begin{align}
    \theta_{t+1} = \theta_t + \gamma_t\theta_t\left(
    1-\theta_t^{2} \right),
\end{align}
where the $\gamma_t \sim \hat{\rho}_\gamma$ are i.i.d.\ and
$\hat{\rho}_\gamma$ is the pushforward of $\hat{\rho}$ under $z \mapsto \eta
\theta_*^{2} z^{2}$; the assumed interval for $\eta$ then translates into
$\text{supp}(\hat{\rho}_\gamma) \subseteq (1, 1.25)$. Then we divide the
interval $(0, 1.162)$ into $4$ regions
    142 \begin{align}
    143     &I_0 = (0, 0.65],\\
    144     &I_1 = (0.65, 1-\varepsilon),\\
    145     &I_2 = (1-\varepsilon, 1),\\
    146     &I_3 = (1, 1.162),
    147 \end{align}
where we note that for all $0< \varepsilon < \varepsilon_0$ the inequality
$\gamma_{\text{min}} (2-\varepsilon) (1-\varepsilon) >2$ holds, with
$\gamma_\text{min} := \inf(\text{supp}\left(\hat{\rho}_\gamma  \right) )$.
It is to be shown that the iterates eventually end up in $I_1$ and thereafter
leave and return to $I_1$ every $2$ steps. This is divided into 4 steps:
\begin{enumerate}
    \item There is a $t\ge 0$ such that $\theta_t \in I_1 \cup I_2 \cup I_3$.
    \item If $\theta_t \in I_3$, then $\theta_{t+1} \in I_1 \cup I_2$.
    \item If $\theta_t \in I_2$, there is a $k>0$ such that
        $\theta_{t+2k} \in I_1$ and $\theta_{t+2k'} \in I_2$ for all
        $0 \le k' < k$.
    \item If $\theta_t \in I_1$, then for all $k \ge 0$ it holds that
        $\theta_{t+2k} \in I_1$ and $\theta_{t+2k+1} \in
        (1+\varepsilon, 1.162)$.
\end{enumerate}
To show 1) it needs to be checked that the function $h_\gamma(\theta)
=\theta + \gamma \theta(1- \theta^{2})$ maps $(0, 1.162)$ into itself for all
$\gamma \in (1,1.25)$. To show 2), for $\theta \in (1, 1.162)$ note that
$h_\gamma(\theta)$ is linear and decreasing in $\gamma$ (since $1 - \theta^{2}
< 0$) and decreasing in $\theta$ on this interval, hence
\begin{align}
    0.652 = h_{1.25}(1.162) < h_\gamma(1.162) < h_\gamma(\theta) <
    h_\gamma(1) = 1.
\end{align}
The points 3 \& 4 can be shown in a similar way with elementary analysis.
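The claimed behavior is easy to reproduce numerically. Below is a minimal
simulation sketch (my own; the distribution of $\gamma_t$ is an arbitrary
choice within the assumed support) of the normalized recursion, illustrating
the almost sure bouncing of Proposition \ref{prop: loss-stab}:
\begin{verbatim}
# Simulate theta_{t+1} = theta_t + gamma_t * theta_t * (1 - theta_t^2)
# with gamma_t i.i.d. in (1, 1.25), i.e. supp(rho_gamma) subset (1, 1.25).
import numpy as np

rng = np.random.default_rng(1)
theta = 0.1                     # initialization in (0, theta_*), theta_* = 1
trajectory = []
for t in range(200):
    gamma = rng.uniform(1.0, 1.25)
    theta = theta + gamma * theta * (1 - theta ** 2)
    trajectory.append(theta)

# After an initial phase the iterates alternate between I_1 = (0.65, 1)
# and I_3 = (1, 1.162) instead of converging to theta_* = 1.
print(trajectory[-6:])
\end{verbatim}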
    172 
    173 
\section{SGD Iteration and Stochastic Differential Equations (SDEs)}
To further understand the effect of this loss stabilization, the authors
assume perfect stabilization and conjecture that during this phase SGD is
well modeled by GD with label noise of constant scale.
    178 \newline
Label noise dynamics have a connection to Stochastic Differential
Equations (SDEs); \cite{li2018stochastic} motivates this approach. Rewrite
the SGD iteration using
\begin{align}
    V_t(\theta_t, i_t) = \sqrt{\eta} \left(\nabla f(\theta_t) - \nabla
    f_{i_t}(\theta_t)  \right),
\end{align}
where in our case $f$ is the full loss and $f_{i_t}$ is the summand selected
to estimate the gradient; then the SGD iteration takes the following form
    189 \begin{align}
    \theta_{t+1} = \theta_t - \eta \nabla f(\theta_t) + \sqrt{\eta}\,
    V_t(\theta_t, i_t).
    192 \end{align}
A straightforward calculation shows \cite{li2018stochastic}:
\begin{align}
    &\mathbb{E}(V_t|\theta_t) = 0, \\
    &\text{cov}(V_t|\theta_t) = \eta \Sigma(\theta_t), \\
    &\Sigma(\theta_t) :=
    \mathbb{E}\left(\frac{V_tV_t^{T}}{\eta}\Big|\theta_t \right).
\end{align}
On the other hand, consider a time-homogeneous It\^o-type SDE for $\tau>0$
\begin{align}
    \label{eq: sde-sgd-dynamics}
    d\theta_\tau = b(\theta_\tau)d\tau
    +\sqrt{\eta}\sigma(\theta_\tau)dB_\tau,
\end{align}
where $B_\tau$ is a standard $p$-dimensional Brownian motion, $b:
\mathbb{R}^{p}\to \mathbb{R}^{p}$ is called the drift and $\sigma:
\mathbb{R}^{p} \to \mathbb{R}^{p\times p}$ is the diffusion matrix. Applying
the Euler discretization scheme with step size $\eta$, approximating
$\theta_{t\eta}$ by $\hat{\theta}_t$ and writing $Z_t \sim \mathcal{N}(0,
\mathbf{I}_p)$ for i.i.d. standard Gaussians, we get
    211 \begin{align}
    \hat{\theta}_{t+1}= \hat{\theta}_t + \eta b(\hat{\theta}_t)
    +\eta \sigma(\hat{\theta}_t)Z_t.
    214 \end{align}
Matching this with the SGD iteration, i.e. setting $b = -\nabla_\theta f $
and $\sigma(\theta) = \Sigma(\theta)^{\frac{1}{2}}$, we get
\begin{align}
    d\theta_\tau = -\nabla f(\theta_\tau)d\tau
    + (\eta\Sigma(\theta_\tau))^{\frac{1}{2}} dB_\tau.
\end{align}
For the loss function $\mathcal{L}$, with noise at state $\theta$ spanned by
$\{\nabla_\theta h_\theta(x_1) ,\ldots , \nabla_\theta h_\theta(x_n)\}$ and
loss stabilization occurring at some level set $\delta$, the authors of
\cite{andriushchenko2023sgd} propose the following SDE model
\begin{align*}
    d\theta_\tau = -\nabla_\theta \mathcal{L}(\theta_\tau)d\tau
    + \sqrt{\eta\delta}
    \phi_{\theta_\tau}\left(X\right)^{T}dB_\tau,
\end{align*}
where $(B_\tau)_{\tau\ge 0}$ is a standard Brownian motion in
$\mathbb{R}^{n}$ and $\phi_{\theta}(X) :=
[\nabla_{\theta}h_{\theta}(x_i)^{T}]_{i=1}^{n} \in \mathbb{R}^{n\times p}$ is
referred to as the Jacobian. This SDE can be interpreted as the effective
slow dynamics that drives the iterates while they bounce rapidly in some
directions at the level set $\delta$.
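To make the model concrete, here is a minimal Euler--Maruyama sketch of this
label-noise SDE (my own construction, not the paper's code), for a linear
model $h_\theta(x) = \langle\theta, x\rangle$ so that $\phi_\theta(X) = X$;
all hyperparameter values are arbitrary:
\begin{verbatim}
# Euler--Maruyama discretization of
#   d theta = -grad L(theta) dtau + sqrt(eta * delta) * phi(theta)^T dB.
import numpy as np

rng = np.random.default_rng(0)
n, p = 20, 50                      # p > n: overparametrized linear model
X = rng.normal(size=(n, p))
y = rng.normal(size=n)

eta, delta, dt, steps = 0.05, 0.1, 1e-3, 10_000
theta = rng.normal(size=p)

for _ in range(steps):
    grad = X.T @ (X @ theta - y) / n          # gradient of L(theta)
    dB = rng.normal(size=n) * np.sqrt(dt)     # Brownian increment in R^n
    theta = theta - grad * dt + np.sqrt(eta * delta) * X.T @ dB

# The loss does not go to zero: it hovers around a noise-dependent plateau,
# mimicking the loss-stabilization phase.
print(0.5 / n * np.sum((X @ theta - y) ** 2))
\end{verbatim}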
    235 
    236 
    237 \section{Sparse Feature Learning}
Consider a two-layer linear network with only diagonal connections, i.e. a
Diagonal Linear Network, with prediction function $h_{u, v}(x) = \langle u,
v\odot x\rangle = \langle u \odot v, x\rangle$, where $\odot$ denotes
elementwise multiplication. In this case the loss is convex in the linear
predictor $\beta:=u\odot v \in \mathbb{R}^{d}$ but not in the parameters
$(u, v)$, so even this simple example exhibits rich non-convex dynamics.
Since $\nabla_u h_{u, v}(x) = v \odot x$, the SDE model is
    245 \begin{align}
    246     du_t = -\nabla_u \mathcal{L}(u_t, v_t) dt + \sqrt{\eta\delta}\; v_t \odot
    247     [X^{T}dB_t],
    248 \end{align}
where $(B_t)_{t\ge 0}$ is a standard Brownian motion in $\mathbb{R}^{n}$
and the analogous equation holds for $(v_t)_{t\ge 0}$ by symmetry.
    251 
The analysis of this effective dynamics in \cite{pillaudvivien2022label}
shows that the linear predictor $\beta_t = u_t \odot v_t$
    255 \begin{enumerate}
    256     \item converges exponentially to zero outside of the support of $\beta^{*}$
    257     \item is with high probability in a $\mathcal{O}(\sqrt{\eta\delta})$
    258         neighborhood of $\beta^{*}$ in its support after time
    259         $\mathcal{O}( \delta^{-1})$.
    260 \end{enumerate}
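These two properties can be illustrated with a rough simulation of the
diagonal-network dynamics (my own sketch with arbitrary hyperparameters and
with the same Brownian increment shared between the $u$ and $v$ equations;
the paper's experiments differ in detail):
\begin{verbatim}
# Euler--Maruyama simulation of the diagonal linear network SDE:
#   du = -grad_u L dt + sqrt(eta * delta) * v * (X^T dB),  and symmetrically
#   dv = -grad_v L dt + sqrt(eta * delta) * u * (X^T dB).
import numpy as np

rng = np.random.default_rng(0)
n, d, r = 80, 200, 20
X = rng.normal(size=(n, d))
beta_star = np.zeros(d)
beta_star[:r] = 1.0                      # r-sparse ground-truth predictor
y = X @ beta_star

eta, delta, dt, steps = 0.05, 0.1, 1e-3, 50_000
u = np.full(d, 0.5)
v = np.full(d, 0.5)

for _ in range(steps):
    residual = X @ (u * v) - y
    grad_u = v * (X.T @ residual) / n    # gradient of L w.r.t. u
    grad_v = u * (X.T @ residual) / n
    dB = rng.normal(size=n) * np.sqrt(dt)
    u_new = u - grad_u * dt + np.sqrt(eta * delta) * v * (X.T @ dB)
    v_new = v - grad_v * dt + np.sqrt(eta * delta) * u * (X.T @ dB)
    u, v = u_new, v_new

beta = u * v
# Per the cited result, off-support coordinates should shrink toward zero,
# while on-support coordinates should fluctuate at scale O(sqrt(eta*delta));
# exact values depend on the arbitrary hyperparameters above.
print(np.median(np.abs(beta[r:])), np.abs(beta[:r] - 1).max())
\end{verbatim}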
    261 
The first phase of SGD with a large step size $\eta$ decreases the training
loss until it stabilizes at some level set $\delta > 0$, and during this
loss stabilization an effective dynamics takes place: it shrinks the
coordinates outside of the support of the sparsest signal and oscillates in
parameter space at level $\mathcal{O}(\sqrt{\eta\delta})$ on its support.
Thereby, decreasing the step size later leads to perfect recovery of the
sparsest predictor.
    269 
The diagonal linear nets thus show noisy dynamics which induce a sparsity
bias. According to \cite{andriushchenko2023sgd} this is due to the term
$v_t \odot [X^{T}dB_t]$: because of the elementwise multiplication it has a
\textit{shrinking effect} on the coordinates, i.e. entries collapse almost
to zero.
    274 \newline
    275 
In general, Equation \ref{eq: sde-sgd-dynamics} shows the same behavior
w.r.t. the Jacobian $\phi_\theta(X)$. This suggests that the implicit bias
of the noise can lead to a shrinkage effect applied to $\phi_\theta(X)$,
which depends on the noise intensity $\delta$ and the step size of SGD.
    281 
Also note the following property of Brownian motion: for $v \in
\mathbb{R}^{p}$, $\langle v, B_t\rangle = \|v\|_2 W_t$ in distribution,
where $(W_t)_{t\ge 0}$ is a one-dimensional Brownian motion. Thereby the
process in Equation \ref{eq: sde-sgd-dynamics} is equivalent to a process in
which the noise contribution of the $i$-th sample is proportional to
$\|\nabla_\theta h_\theta(x_i)\|\,dW_{t}^{i}$. This SDE structure, similar
to a geometric Brownian motion, is expected to induce shrinkage of each
multiplicative factor $(\|\nabla_\theta h_\theta(x_i)\|)_{i=1}^{n}$, hence
the authors of \cite{andriushchenko2023sgd} conjecture: \textit{the noise
part of Equation \ref{eq: sde-sgd-dynamics} seeks to minimize the
$\ell_2$-norm of the columns of $\phi_\theta(X)$.}
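To see why a multiplicative noise structure has a shrinking effect, recall
the standard computation for a scalar geometric Brownian motion (a generic
fact, not specific to the paper): for $\sigma > 0$,
\begin{align*}
    dZ_\tau = \sigma Z_\tau \, dW_\tau
    \quad\Longrightarrow\quad
    Z_\tau = Z_0 \exp\left( \sigma W_\tau - \tfrac{\sigma^{2}}{2}\tau
    \right),
\end{align*}
so although $\mathbb{E}[Z_\tau] = Z_0$ is constant, $Z_\tau \to 0$ almost
surely as $\tau \to \infty$, since $W_\tau / \tau \to 0$ while the drift
$-\sigma^{2}\tau/2$ dominates. This is the shrinkage mechanism alluded to
above.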
    293 \newline
    294 
In summary, the SDE dynamics suggests that the noise part tries to collapse
the Jacobian by minimizing its $\ell_2$ norm, while the fitting part tries to
fit the data and thereby prevents the Jacobian from collapsing to zero
entirely. This motivates manually setting Jacobian entries below a certain
threshold to zero and counting the average number of \textbf{distinct} and
\textbf{non-zero} activations over the training set, which is defined as the
\textbf{feature sparsity coefficient}.
    302 
    303 \section{Empirical Evidence}
In the following section the results for diagonal linear nets, simple ReLU
nets and deep DenseNets on CIFAR-10, CIFAR-100 and TinyImageNet are
presented. The common observations are
    307 \begin{enumerate}
    \item \textbf{Loss Stabilization:} Training loss stabilizes around a high
        level set until the step size is decayed,
    \item \textbf{Generalization benefit:} Longer loss stabilization leads
        to better generalization,
    \item \textbf{Sparse feature learning:} Longer loss stabilization
        leads to sparse features.
    314 \end{enumerate}
In some cases a large step size that would lead to loss stabilization could
not be found; in those cases a \textit{warmup} step size schedule is used,
which is explicitly mentioned. The \textit{warmup} schedule increases the
step size over the first iterations, according to some schedule, to make
sure that loss stabilization occurs.
    319 \newline
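As an illustration, such schedules can be implemented as follows (my own
sketch; the fractions, factors and peak values are placeholders and not the
paper's exact settings):
\begin{verbatim}
# Illustrative step-size schedules: a linear or exponential warmup up to a
# peak value, a constant loss-stabilization phase, then a decayed step size.
def step_size(t, total_steps, base_lr=0.05, peak_lr=1.0,
              warmup_frac=0.1, decay_frac=0.5, decay_factor=0.1,
              mode="linear"):
    warmup_steps = int(warmup_frac * total_steps)
    if t < warmup_steps:
        if mode == "linear":
            return base_lr + (peak_lr - base_lr) * t / warmup_steps
        return min(peak_lr, base_lr * 1.05 ** t)   # exponential warmup
    if t < decay_frac * total_steps:
        return peak_lr                             # loss-stabilization phase
    return peak_lr * decay_factor                  # after the decay point

# Example: step size in the middle of training.
print(step_size(10_000, total_steps=20_000))
\end{verbatim}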
    320 
To measure sparse feature learning, both the rank of the Jacobian
$\phi_{\theta}(X)$ and the feature sparsity coefficient are measured.
Specifically, the rank is computed over the iterations of each model, except
for the deep networks, for which this is computationally too expensive. This
is done by using a threshold on the singular values of $\phi_\theta(X)$
normalized by the largest singular value. The normalization ensures that
differences in rank are not due to different scales of $\phi_\theta(X)$ at
different iterations. The Jacobian $\phi_\theta(X)$ is computed on a number
of fresh samples equal to the number of parameters $|\theta|$, to make sure
the rank deficiency is not coming from $n \ll |\theta|$.
    331 \newline
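A sketch of this numerical-rank measurement (the threshold value below is
my own choice, not the paper's):
\begin{verbatim}
# Numerical rank: count singular values of phi_theta(X) that stay above a
# threshold after normalizing by the largest singular value.
import numpy as np

def numerical_rank(jacobian, threshold=1e-3):
    s = np.linalg.svd(jacobian, compute_uv=False)   # descending order
    s = s / s[0]                                    # scale-invariant
    return int(np.sum(s > threshold))

# Example with a synthetic low-rank matrix standing in for phi_theta(X).
rng = np.random.default_rng(0)
phi = rng.normal(size=(500, 30)) @ rng.normal(size=(30, 1000))
print(numerical_rank(phi))                          # 30 for this example
\end{verbatim}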
    332 
As for the measurement of the feature sparsity coefficient, the average
fraction of \textbf{distinct} and non-zero \textbf{activations} at some layer
is counted over the training set, where a value of $100\%$ means a completely
dense feature vector and a value of $0\%$ a feature vector with all zeros. A
pair of activations is counted as highly correlated, and hence not distinct,
if their Pearson correlation coefficient is at least $0.95$.
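The following is one possible reading of this measurement (my own sketch;
the paper's exact implementation may differ, e.g. in how the average over
the training set is taken):
\begin{verbatim}
# Feature sparsity coefficient: fraction of units at a layer that are
# non-zero and mutually distinct (pairwise Pearson correlation < 0.95).
import numpy as np

def feature_sparsity_coefficient(acts, corr_threshold=0.95):
    # acts: (num_samples, num_units) activations at the chosen layer
    num_units = acts.shape[1]
    nonzero = np.any(acts != 0, axis=0)
    acts = acts[:, nonzero]                    # drop all-zero units
    if acts.shape[1] == 0:
        return 0.0
    corr = np.corrcoef(acts, rowvar=False)     # unit-by-unit correlations
    kept = []                                  # greedily keep distinct units
    for j in range(acts.shape[1]):
        if all(corr[j, k] < corr_threshold for k in kept):
            kept.append(j)
    return len(kept) / num_units

# Random ReLU activations are dense and weakly correlated: value close to 1.
rng = np.random.default_rng(0)
print(feature_sparsity_coefficient(np.maximum(rng.normal(size=(256, 64)), 0)))
\end{verbatim}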
    339 
    340 \subsection{Diagonal Linear Networks}
    341 The setup for testing diagonal linear networks is the following. The inputs
    342 $(x_i)_{i=1}^{n}$ with $n=80$ are sampled from $\mathcal{N}(0,
    343 \mathbf{I}_d)$, where $\mathbf{I}_d$ is the identity matrix with $d=200$. The
    344 outputs are generated with $y_i = \langle \beta_* , x_i\rangle$ where
$\beta_* \in \mathbb{R}^{d}$ is $r=20$-sparse. Four different SGD runs are
considered: one with a small step size and three with an initially large
step size decayed after $10\%$, $30\%$ and $50\%$ of the iterations,
respectively.
    348 
    349 \begin{figure}[H]
    350     \centering
    351     \includegraphics[width=0.8\textwidth]{./pics/dn_loss.png}
    352     \includegraphics[width=0.8\textwidth]{./pics/dn_sparsity.png}
    353     \includegraphics[width=\textwidth]{./pics/dn_setup.png}
    354     \caption{Diagonal Linear Nets Results \label{fig: diagonal}}
    355 \end{figure}
    356 
Figure \ref{fig: diagonal} shows the results stated in the previous
sections. In all three cases with a large step size the loss stabilizes, and
the progress of sparse feature learning can be seen in the rank of the
Jacobian and in the feature sparsity coefficient.
    361 
    362 
    363 
    364 \subsection{Simple ReLU Networks}
    365 \textbf{Two Layer ReLU networks in 1D.}
Considered is a one-dimensional regression task with $12$ points. A ReLU
network with $100$ neurons is trained with SGD using a long linear warmup
followed by step size decays at $2\%$ and $50\%$ of the iterations,
respectively. Loss stabilization is observed at around $10^{-5}$, the
predictor becomes simpler and is expected to generalize better, and both the
rank and the feature sparsity coefficient decrease during loss
stabilization. Because of the low dimensionality of the example it can be
directly observed that the final predictor is sparse in terms of the number
of distinct ReLU kinks, see Figure \ref{fig: relu2}.
    375 
    376 \begin{figure}[H]
    377     \centering
    378     \includegraphics[width=0.8\textwidth]{./pics/relu2_loss.png}
    379     \includegraphics[width=0.8\textwidth]{./pics/relu2_sparsity.png}
    380     \includegraphics[width=\textwidth]{./pics/relu2_setup.png}
    381     \caption{\textbf{Two Layer} ReLU: 1D regression task \label{fig: relu2}}
    382 \end{figure}
    383 
    384 
    385 \textbf{Three Layer ReLU networks in 1D.}
    386 For deeper ReLU networks a teacher-student setup with a random three layer
    387 teacher ReLU network with 2 neurons at each hidden layer is used. The student
    388 network has 10 neurons on each layer and is trained on $50$ samples. This
    389 kind of setup is useful because it is known that the student network can
    390 implement the ground truth predictor but might not find it due to the small
sample size. The model is trained using SGD with a medium constant step
size and, in contrast, with a large step size (with warmup) decayed after
$10\%$, $30\%$ and $50\%$ of the iterations, respectively.
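The teacher-student data generation can be sketched as follows (my own
instantiation with arbitrary weights and input distribution; only the
architecture sizes and the sample count come from the setup above):
\begin{verbatim}
# Random 3-layer ReLU teacher (1 -> 2 -> 2 -> 1) labelling 50 scalar inputs.
import numpy as np

rng = np.random.default_rng(0)
relu = lambda z: np.maximum(z, 0.0)

W1, b1 = rng.normal(size=(2, 1)), rng.normal(size=2)
W2, b2 = rng.normal(size=(2, 2)), rng.normal(size=2)
W3, b3 = rng.normal(size=(1, 2)), rng.normal(size=1)

def teacher(x):                        # x: (num_samples, 1)
    h = relu(x @ W1.T + b1)
    h = relu(h @ W2.T + b2)
    return h @ W3.T + b3

X = rng.uniform(-1.0, 1.0, size=(50, 1))   # 50 training inputs
y = teacher(X)
# A student with 10 neurons per hidden layer can represent this teacher
# exactly, but may fail to find it from only 50 samples.
\end{verbatim}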
    394 
    395 \begin{figure}[H]
    396     \centering
    397     \includegraphics[width=0.8\textwidth]{./pics/relu3_loss.png}
    398     \includegraphics[width=\textwidth]{./pics/relu3_sparsity.png}
    399     \includegraphics[width=\textwidth]{./pics/relu3_setup.png}
    \caption{\textbf{Three Layer} ReLU: teacher-student setup \label{fig: relu3}}
    401 \end{figure}
    402 
    403 \subsection{Deep ReLU Networks}
In this section a state-of-the-art example is considered to show sparse
feature learning. Specifically, an image classification task is considered,
where a DenseNet-100-12 is trained on CIFAR-10, CIFAR-100 and TinyImageNet
using SGD with batch size $256$ and different step size schedules. An
exponentially increasing step size schedule with exponent $1.05$ is used to
establish loss stabilization. The rank of the Jacobian cannot be measured
because the matrix dimensions are too large, hence only the feature sparsity
coefficient is taken, at two layers of the DenseNets: the end of super
block 3 (middle of the network) and super block 4 (before the average
pooling at the end of the network). Two cases are tested: a basic setting
and a state-of-the-art setting with momentum and standard augmentation. In
all of the cases the same observations are made as for the much simpler toy
models, i.e. the shallow ReLU networks and the diagonal linear nets, see
Figure \ref{fig: deep}.
    417 
    418 \begin{figure}[H]
    419     \centering
    420     \includegraphics[width=0.8\textwidth]{./pics/densen_tiny_loss.png}
    421     \includegraphics[width=0.8\textwidth]{./pics/densen_tiny_sparsity.png}
    422     \includegraphics[width=\textwidth]{./pics/densen_tiny_setup.png}
    \caption{\textbf{DenseNet-100-12} trained on \textbf{TinyImageNet}
    (image classification task) \label{fig: deep}}
    425 \end{figure}
    426 
    427 
    428 \nocite{andriushchenko2023sgd}
    429 \nocite{shalev2014understanding}
    430 \nocite{pillaudvivien2022label}
    431 \printbibliography
    432 
    433 \end{document}