      1 \include{./preamble.tex}
      2 
      3 
      4 \begin{document}
      5 
      6 \maketitle
      7 
      8 \tableofcontents
      9 
     10 \section{Intro}
The talk is about the paper \cite{scherzer2023newton}, in which the authors
consider using a modified version of the Newton algorithm to reconstruct the
coefficients of neural network coders in inverse problems. Instead of the
inverse of the Jacobian, the iteration uses the more general Moore-Penrose
inverse. It can be proven that, under suitable conditions, this modified
form of the Newton iteration, called the Gauss-Newton iteration, converges
locally and quadratically. The aim is to apply these results specifically to
shallow neural networks, because they satisfy an immersion property, i.e.
they are Lipschitz-differentiable immersions, which is the prerequisite for
the convergence of the Gauss-Newton method. Additionally, the presented
results show a faster and more reliable reconstruction than the comparison
method, the Landweber iteration. I also verified these results by
reimplementing the code and reproducing the same conclusions with the same
initial setup \cite{code}.
     24 
     25 \subsection{Posing the problem}
Consider a linear operator equation between Hilbert spaces $\mathbf{X}$ and
$\mathbf{Y}$
     28 \begin{align}
     29     F\mathbf{x} = \mathbf{y}.
     30 \end{align}
For modeling the problem a function is introduced, called the \textbf{Coding}
$\Psi: \vec{P} \to \mathbf{X}$, a nonlinear operator which maps NN parameters
to image functions. The problem equation can then be written as
     34 \begin{align}
     35     N(\vec{p}) = F\Psi(\vec{p}) = \mathbf{y}, \label{eq: main}
     36 \end{align}
where $\mathbf{X}$ is the image space, $\mathbf{Y}$ the data space and
$\vec{P}$ the parameter space. If the operator $F$ were nonlinear, the
equation would of course be fully nonlinear; this case is not considered here.
     40 
     41 \subsection{Decomposition cases (review)}
An operator $N$ satisfies the \textit{1st decomposition case} in an open,
non-empty neighborhood $\mathcal{B}\left(\vec{p}\;^{\dagger}; \rho \right)
\subseteq \vec{P}$ (an open ball around $\vec{p}\;^{\dagger}$ with radius
$\rho$), if there exists a linear operator $F:\vec{P}\to \mathbf{X}$ and a
nonlinear operator $\Psi:\mathbf{X} \to \mathbf{Y}$ such that
     47 \begin{align}
     48     N(\vec{p}) = \Psi(F\vec{p}).
     49 \end{align}
     50 The \textit{2nd decomposition case} for operator $N$ in the same setting is
     51 satisfied, if there exists a linear operator $F: \mathbf{X} \to \mathbf{Y}$
     52 and a nonlinear operator $\Psi: \vec{P} \to \mathbf{X}$ such that
     53 \begin{align}
     54     N(\vec{p}) = F\Psi(\vec{p}).
     55 \end{align}
     56 
     57 \subsection{Gauss-Newton type method for 2nd decomposition case}
The main object is the operator $\Psi:\mathcal{D}(\Psi) \subseteq \vec{P} :=
\mathbb{R}^{n_*} \to \mathbf{X}$. The derivative of $\Psi$ is generally not
invertible.
     61 \begin{align}
     62     N(\vec{p}) = F\Psi(\vec{p}).
     63 \end{align}
The aim is to use the modified version of the Newton method to reconstruct
the parameters $\vec{p}$. The prerequisites for this are some background on
the local convergence of the Newton method, the so-called Newton-Mysovskii
conditions, and the definition of the Moore-Penrose inverse.
     68 \section{Background}
     69 \subsection{Newton-Mysovskii}
The local convergence of the Newton method is guaranteed under the so-called
Newton-Mysovskii conditions. In this section the results are summarized for
the simple case of finite-dimensional spaces, where the derivatives of the
nonlinear operator are invertible. This result is then extended in the
remainder of the summary.
     74  \begin{theorem}
     75      Let $N: \mathcal{D}(N) \subseteq \mathbb{R}^{n}\to \mathbb{R}^{n}$ be
     76      continuously Fr\'echet differentiable on a non-empty, open and convex
     77      set $\mathcal{D}\left( N \right) $. Let $\vec{p}\;^{\dagger} \in
     78      \mathcal{D}(N)$ be the solution of $N(\vec{p}\;) = \mathbf{y}$, where
     79      $\mathbf{y} \in \mathbb{R}^{n}$. Also assume that
     80      \begin{enumerate}
         \item $N'(\vec{p}\;)$ is invertible $\forall \vec{p} \in
             \mathcal{D}(N)$ and
         \item the Newton-Mysovskii conditions hold, i.e. $\exists C_N > 0:$
            \begin{align}
                &\big\| N'(\vec{p}\;)^{-1}\left( N'(\vec{q} + s(\vec{p} -
                \vec{q}\;)) - N'(\vec{q}\;)\right) (\vec{p} - \vec{q})
                \big\|_{\vec{P}}
                \leq s C_N \|\vec{p} - \vec{q} \;\|^{2}_{\vec{P}}\\
                & \forall \vec{p}, \vec{q} \in \mathcal{D}(N), \; s \in[0,
                1],
            \end{align}
     92      \end{enumerate}
     Now let $\vec{p}\;^{0} \in \mathcal{D}(N)$ be the starting point of the
     Newton iteration, which should be sufficiently close to the solution,
     i.e. satisfying
     \begin{align}
         &\overline{\mathcal{B}\left(\vec{p}\;^{0}, \rho \right)}\subseteq
         \mathcal{D}(N) \qquad \text{with}\\
         &\rho := \|\vec{p}\;^{\dagger} - \vec{p}\;^{0}\|_{\vec{P}} \quad
         \text{and} \quad h:= \frac{\rho C_N}{2} <1. \label{eq: locality}
    101      \end{align}
    102      Then the Newton iteration with starting point $\vec{p}\;^{0}$ and
    103      iterates $\left\{\vec{p}\;^{k}  \right\}_{k \in \mathbb{N}_0} \subseteq
    104      \overline{\mathcal{B}(\vec{p}\;^{0}, \rho)} $ of the
    105      form
    106      \begin{align}
    107          \vec{p}\;^{k+1} = \vec{p}\;^{k} - N'(\vec{p}\;^{k})^{-1}\left(
    108          N\left(\vec{p}\;^{k}  \right) - \mathbf{y} \right)  \qquad k \in
    109          \mathbb{N}_0,
    110      \end{align}
     converges to $\vec{p}\;^{\dagger} \in \overline{\mathcal{B}(\vec{p}\;^{0},
     \rho)}$ \textbf{quadratically}.
    113  \end{theorem}
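
For intuition, the following is a minimal numerical sketch of this classical
Newton iteration on a hypothetical two-dimensional test problem (the
functions, starting point and tolerances are made up and not from the paper):
\begin{verbatim}
import numpy as np

def newton(N, Nprime, y, p0, max_iter=20, tol=1e-12):
    """Classical Newton: p^{k+1} = p^k - N'(p^k)^{-1} (N(p^k) - y)."""
    p = np.asarray(p0, dtype=float)
    for _ in range(max_iter):
        step = np.linalg.solve(Nprime(p), N(p) - y)
        p = p - step
        if np.linalg.norm(step) < tol:
            break
    return p

# Hypothetical smooth test problem with invertible Jacobian near p_dag
N = lambda p: np.array([p[0]**2 + p[1], np.sin(p[0]) + p[1]**3])
Nprime = lambda p: np.array([[2*p[0], 1.0], [np.cos(p[0]), 3*p[1]**2]])
p_dag = np.array([1.2, 0.7])
print(newton(N, Nprime, N(p_dag), p0=p_dag + 0.1))  # -> approx. p_dag
\end{verbatim}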
    114 
    115 \subsection{Moore-Penrose Inverse}
Now the case where $\mathbf{Y}$ is an infinite-dimensional Hilbert space is
introduced. It is necessary to replace the inverse in the classical Newton
method, because the linearizations of the operator $N$ are generally not
invertible. This is done by introducing the so-called Moore-Penrose inverse
or, more generally, the outer inverse; we then speak of the Gauss-Newton
method to distinguish it from the classical version.
    122 \begin{mydef}{Inner, outer and Moore Penrose inverse
    123         \label{def: moore-penrose}}
    Let $L: \vec{P} \to \mathbf{Y}$ be a linear and bounded operator between
    vector spaces $\vec{P}$ and $\mathbf{Y}$. Then
    126     \begin{enumerate}
    127         \item $B: \mathbf{Y} \to \vec{P}$ is called \textbf{left-inverse} to
    128             $L$ if
    129             \begin{align}
    130                 BL = I
    131             \end{align}
    132         \item $B: \mathbf{Y} \to \vec{P}$ is called \textbf{right-inverse} to
    133             $L$ if
    134             \begin{align}
    135                 LB = I
    136             \end{align}
        \item $B: \mathbf{Y} \to \vec{P}$ is called an \textbf{inverse} to
            $L$ if it is both a left and a right inverse to $L$.
        \item $B: \mathbf{Y} \to \vec{P}$ is called an \textbf{outer inverse} to
            $L$ if
    141             \begin{align}
    142                 BLB = B
    143             \end{align}
        \item Let $\vec{P}$ and $\mathbf{Y}$ be Hilbert spaces and $L: \vec{P}
            \to \mathbf{Y}$ a linear bounded operator. Denote by $P$ and $Q$
            the orthogonal projections onto the (closed) nullspace
            $\mathcal{N}(L)$ of $L$ and onto the closure of the range
            $\overline{\mathcal{R}\left(L  \right)}$ of $L$, respectively.
            This means that for all $\vec{p}
            \in \vec{P}$ and $\mathbf{y} \in \mathbf{Y}$ we have
            \begin{align}
                &P\vec{p} = \text{argmin}
                \left\{
                    \|\vec{p}_1-\vec{p}\|_{\vec{P}} : \vec{p}_1 \in
                \mathcal{N}(L) \right\},\\
                &Q\mathbf{y} = \text{argmin}
                \left\{
                    \|\mathbf{y}_1 - \mathbf{y}\|_\mathbf{Y}: \mathbf{y}_1 \in
                    \overline{\mathcal{R}(L)} \right\}.
            \end{align}
            Then the operator $B: \mathcal{D}(B) \subseteq \mathbf{Y} \to
            \vec{P}$ with $\mathcal{D}(B):= \mathcal{R}(L) \dotplus
            \mathcal{R}(L)^{\perp}$ is called the \textbf{Moore-Penrose
            inverse} of $L$ if the following conditions (identities) hold
    164             \begin{align}
    165                 &LBL = L, \nonumber\\
    166                 &BLB = B, \nonumber\\
    167                 &BL= I-P, \label{eq: moore-penrose}\\
    168                 &LB = Q|_{\mathcal{D}(B)} \nonumber.
    169             \end{align}
    170 
    171     \end{enumerate}
    The left and right inverses are used in different contexts. For a left
    inverse to exist the nullspace of $L$ has to be trivial (that of $B$ need
    not be); for a right inverse, conversely, the nullspace of $B$ needs to
    be trivial.
    176 
    177 
    178 \end{mydef}
    179 
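In the finite-dimensional case the Moore-Penrose inverse can be computed
numerically; the following sketch (using a made-up matrix and
numpy.linalg.pinv) checks the four identities from \ref{eq: moore-penrose}:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
L = rng.standard_normal((5, 3))   # tall full-rank matrix: trivial nullspace
B = np.linalg.pinv(L)             # Moore-Penrose inverse

Q = L @ B                         # orthogonal projection onto R(L)
print(np.allclose(L @ B @ L, L))              # LBL = L
print(np.allclose(B @ L @ B, B))              # BLB = B
print(np.allclose(B @ L, np.eye(3)))          # BL = I - P, here P = 0
print(np.allclose(Q, Q.T), np.allclose(Q @ Q, Q))  # LB is the projection Q
\end{verbatim}
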
    180 \subsection{Lipschitz-differentiable immersion}
\begin{mydef}{Lipschitz-differentiable immersion}
    Let $\Psi$ be a network coder with the $n_* = N(n+2)$ parameters
    $(\vec{\alpha}, \mathbf{w}, \vec{\theta})$. Let $\Psi'$ be
    Lipschitz-continuous and let
    \begin{align}
        \mathbf{X}_{\vec{p}} =
        \text{span}\{\partial_{p_i}\Psi(\vec{p})\;:\;i=1,\ldots,n_*\},
        \label{eq: lpdi-property}
    \end{align}
    have dimension $n_*$.
    Let $\Psi'(\vec{p})^{\dagger}$ denote a generalized inverse,
    which replaces the standard inverse in the classical Newton method.
    TODO: more detailed definition.
    194 \end{mydef}
Linking the generalized inverse of a Lipschitz-differentiable immersion with
the Moore-Penrose inverse, together with the following results, delivers the
Gauss-Newton method.
    198 \begin{theorem}
    199     \label{thm: moore-penrose}
    200     \begin{enumerate}
    201         \item The function $\Psi'(\vec{p}\;)^{\dagger}: \mathbf{X} \to \vec{P}$
    202             is the Moore-Penrose inverse of $\Psi'(\vec{p}\;)$.
    203         \item For all $\vec{p} \in \mathcal{D}(\Psi) \subseteq \vec{P}$
    204             there is a non-empty closed neighborhood where
    205             $\Psi'(\vec{p})^{\dagger}$ is uniformly bounded and it is
    206             Lipschitz-continuous, i.e.
    207             \begin{align}
    208                 &\|\Psi'(\vec{p}\;)^{\dagger} -
    209                 \Psi'(\vec{q}\;)^{\dagger}\|_{\mathbf{X}\to\vec{P}}
    210                 \leq C_L \|\vec{p} - \vec{q}\;\|_{\vec{P}}&\\
    211                 &\|\Psi'(\vec{p}\;)^{\dagger}
    212                 \|_{\mathbf{X}\to\vec{P}} \leq C_l\qquad &\text{for}\;\;
    213                 \vec{p}, \vec{q}\in \mathcal{D}(\Psi).
    214             \end{align}
    \item The operator $P_{\vec{p}}: \mathbf{X} \to \mathbf{X}_{\vec{p}}$ is
            bounded.
    217     \end{enumerate}
    218 \end{theorem}
    219 \begin{proof}
    220     The proof can be found in the appendix \ref{proof: thm-moore-penrose}
    221 \end{proof}
    222 
We can now relate these results back to the original problem \ref{eq: main}
and the Gauss-Newton method.
    225 \begin{lemma}
    226     \label{lem: moore-penrose}
    Let $F$ be as in \ref{eq: main}: linear, bounded, with trivial nullspace
    and dense range (as needed for the Moore-Penrose inverse). Let $\Psi:
    \mathcal{D}(\Psi)\subseteq \mathbb{R}^{n_*} \to \mathbf{X}$ be a
    Lipschitz-differentiable immersion and $N = F\circ \Psi$ as in
    \ref{eq: main}. Note the immediate consequence that
    $\mathcal{D}(N) = \mathcal{D}(\Psi)$; therefore $\forall \vec{p} \in
    \mathcal{D}(N)$ the derivative of the operator $N$ has a Moore-Penrose
    inverse $N'(\vec{p}\;)^{\dagger}$, satisfying
    235     \begin{enumerate}
    236         \item The decomposition property of the Moore-Penrose inverse
    237             \begin{align}
    238                 N'(\vec{p}\;)^{\dagger}\mathbf{z} =
    239                 \Psi'(\vec{p}\;)^{\dagger}F^{-1}\mathbf{z} \qquad \forall
    240                 \vec{p} \in \mathcal{D}(N),\; \mathbf{z} \in \mathcal{R}(F)
    241                 \subseteq \mathbf{Y}
    242             \end{align}
    243             which means that
            \begin{align}
                &N'(\vec{p}\;)^{\dagger}N'(\vec{p}\;) = I \quad \text{on} \;\;
                \mathbb{R}^{n_*} \qquad \text{and}\\
                &N'(\vec{p}\;)N'(\vec{p}\;)^{\dagger} =
                \mathcal{Q}|_{\mathcal{R}(FP_{\vec{p}})},
            \end{align}
            where $\mathcal{Q}$ is the orthogonal projection of $\mathbf{Y}$
            onto $\overline{\mathcal{R}(FP_{\vec{p}})}$, defined on
            $\overline{\mathcal{R}(FP_{\vec{p}})}\dotplus\mathcal{R}(FP_{\vec{p}})^{\perp}$.
            So in the definition of the Moore-Penrose inverse
            \ref{def: moore-penrose},
            $P$ in \ref{eq: moore-penrose} is $P \equiv 0$.
    255         \item The generalized Newton-Mysovskii conditions are also satisfied
    256             \begin{align}
                &\big\| N'(\vec{p}\;)^{\dagger}\left( N'(\vec{q} + s(\vec{p} -
                \vec{q}\;)) - N'(\vec{q}\;)\right) (\vec{p} - \vec{q})
    259                 \big\|_{\vec{P}}
    260                 \leq s C_l C_L \|\vec{p} - \vec{q} \;\|^{2}_{\vec{P}}\\
    261                 & \forall \vec{p}, \vec{q} \in \mathcal{D}(N), \; s \in[0,
    262                 1],
    263             \end{align}
    264             where $C_l, C_L$ are the Lipschitz-constants.
    265     \end{enumerate}
    266 \end{lemma}
    267 \begin{proof}
    268     The proof can be found in the appendix \ref{proof: lem-moore-penrose}
    269 \end{proof}
    270 
We now bring everything together to show the local convergence of the
Gauss-Newton method, where $N = F\circ \Psi$ is a composition of a linear
bounded operator and a Lipschitz-differentiable immersion.
    274 \begin{theorem}
    275     \label{thm: gauss-newton-convergence}
    276     Assume there exists a $\vec{p}\;^{\dagger} \in \mathcal{D}(\Psi)$ which
    277     satisfies
    278     \begin{align}
    279         N(\vec{p}\;^{\dagger}) = \mathbf{y},
    280     \end{align}
    and assume there exists a $\vec{p}\;^{0} \in \mathcal{D}(\Psi)$
    satisfying the locality condition \ref{eq: locality}. Then the iterates
    283     $\{\vec{p}\;^{k}\}_{k\in \mathbb{N}_0}$  of the Gauss-Newton method of
    284     the form
    285      \begin{align}
    286          \vec{p}\;^{k+1} = \vec{p}\;^{k} - N'(\vec{p}\;^{k})^{\dagger}\left(
    287          N\left(\vec{p}\;^{k}  \right) - \mathbf{y} \right)  \qquad k \in
    288          \mathbb{N}_0, \label{eq: gauss-newton-iteration}
    289      \end{align}
    290      are well-defined in $\overline{\mathcal{B}\left(\vec{p}\;^{0}, \rho
    291      \right) }$ and converge quadratically to $\vec{p}\;^{\dagger}$.
    292 \end{theorem}
    293 \begin{proof}
    294     The proof can be found in the appendix \ref{proof: thm-gauss-newton-convergence}
    295 \end{proof}
    296 
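A minimal sketch of the Gauss-Newton iteration
\ref{eq: gauss-newton-iteration}, with the Moore-Penrose inverse of the
Jacobian taken numerically via numpy.linalg.pinv (the residual map and data
below are a made-up overdetermined toy example):
\begin{verbatim}
import numpy as np

def gauss_newton(N, Nprime, y, p0, max_iter=50, tol=1e-12):
    """Gauss-Newton: p^{k+1} = p^k - N'(p^k)^dagger (N(p^k) - y)."""
    p = np.asarray(p0, dtype=float)
    for _ in range(max_iter):
        step = np.linalg.pinv(Nprime(p)) @ (N(p) - y)
        p = p - step
        if np.linalg.norm(step) < tol:
            break
    return p

# Toy example: 4 data values, 2 parameters, attainable data y = N(p_true)
N = lambda p: np.array([p[0], p[1], p[0] * p[1], p[0]**2])
Nprime = lambda p: np.array([[1, 0], [0, 1], [p[1], p[0]], [2 * p[0], 0]])
p_true = np.array([0.8, -0.3])
print(gauss_newton(N, Nprime, N(p_true), p0=p_true + 0.05))  # ~ p_true
\end{verbatim}
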
    297 
    298 \subsection{Neural networks}
    299 Shallow neural network coders are of the following form
    300 \begin{align}
    301     \label{eq: shallow-nn}
    302     \Psi:
    303     \mathcal{D}(\Psi) := \mathbb{R}^{n_*} =
    304     \mathbb{R}^{N}\times \mathbb{R}^{n \times N}
    305     \times \mathbb{R}^{N}
    306     &\to \mathbf{X} :=
    307     L^{2}\left([0, 1]^{n}\right),\\
    \vec{p} = (\vec{\alpha}, \mathbf{w}, \vec{\theta}) &\mapsto
    \left(\vec{x} \mapsto \sum_{j=1}^{N} \alpha_j\sigma\left(
    \mathbf{w}_j^{T}\vec{x} + \theta_j \right)  \right),
\end{align}
where $\sigma$ is an activation function, such as tanh or the sigmoid.
    313 
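A direct transcription of \ref{eq: shallow-nn} as a sketch (the sigmoid
activation, the grid discretizing $[0,1]^{2}$ and the parameter values,
reused from the numerical example later in these notes, are assumptions of
this sketch):
\begin{verbatim}
import numpy as np

sigma = lambda z: 1.0 / (1.0 + np.exp(-z))          # assumed activation

def Psi(alpha, w, theta, x):
    """Shallow coder: Psi(p)(x) = sum_j alpha_j sigma(w_j^T x + theta_j).
    alpha, theta: (N,);  w: (n, N) with columns w_j;  x: (m, n) grid."""
    return sigma(x @ w + theta) @ alpha              # (m,) vector of values

# Evaluate on a uniform grid discretizing [0, 1]^2
alpha, theta = np.array([1.0, 0.3]), np.array([0.1, 0.8])
w = np.array([[1.0, 0.1],
              [0.1, 1.0]])
g = np.linspace(0.0, 1.0, 50)
x = np.stack(np.meshgrid(g, g), axis=-1).reshape(-1, 2)
values = Psi(alpha, w, theta, x)   # discretized element of L^2([0,1]^2)
\end{verbatim}
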
    314 A standard deep neural network (DNN) with $L$ layers is a function depending on $\vec{x} \in
    315 \mathbb{R}^{n}$ with parameters $\vec{p}:=\left( \vec{\alpha}_l,
    316 \mathbf{w}_l, \vec{\theta}_l  \right)_{l=1}^{L}$
    317 \begin{align}
    \vec{x}\to\Psi(\vec{x}) := \sum_{j_L=1}^{N_L} \alpha_{j_L,L}\sigma_L
    \left( \rho_{j_L, L} \left( \sum_{j_{L-1}=1}^{N_{L-1}}\cdots
    \left( \sum_{j_1=1}^{N_1}\alpha_{j_1,1}\sigma_1\left(\rho_{j_1,1}(\vec{x})
    \right)  \right)  \right)  \right),
\end{align}
where
\begin{align}
    \rho_{j,l}(\vec{x}) = \mathbf{w}_{j, l}^{T}\vec{x} + \theta_{j,l},
\end{align}
with $\alpha_{j,l}, \theta_{j,l} \in \mathbb{R}$ and $\vec{x},
\mathbf{w}_{j,l} \in \mathbb{R}^{n} \;\; \forall l=1,\ldots,L$. Such a deep
network is probably not a Lipschitz-differentiable immersion!
    330 
    331 Then the inverse problem
    332 \begin{align}
    333     N(\vec{p}\;) = F\Psi(\vec{p}\;) = \mathbf{y},
    334 \end{align}
    335 can be solved using variational methods, e.g. Tikhonov regularization
    336 \begin{align}
    337     \|N(\vec{p}) - \mathbf{y}\|^{2} + \alpha \|\vec{p}\|^{2} \to \min,
    338 \end{align}
    339 or alternatively state space regularization
    340 \begin{align}
    341     \|N(\vec{p}) - \mathbf{y}\|^{2}
    342     + \alpha \|\mathbf{x} - \mathbf{x}_0\|^{2}
    343     \to \min \quad \text{s.t} \quad \Psi(\vec{p}) = \mathbf{x}.
    344 \end{align}
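
As a point of reference, a minimal sketch of the Tikhonov variant with a
generic optimizer (F, Psi_vec, y and alpha are placeholders; this is not the
approach followed below):
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

def tikhonov(F, Psi_vec, y, p0, alpha=1e-3):
    """Minimize ||F Psi(p) - y||^2 + alpha ||p||^2 over the parameters p.
    F: (m, m') matrix, Psi_vec: p -> (m',) discretized coder, y: (m,) data."""
    fun = lambda p: np.sum((F @ Psi_vec(p) - y) ** 2) + alpha * np.sum(p ** 2)
    return minimize(fun, p0).x   # quasi-Newton with numerical gradients
\end{verbatim}
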
Here, instead, the iterative approach is analyzed, namely the Gauss-Newton
method
\begin{align}
    \vec{p}\;^{k+1} = \vec{p}\;^{k} -
    N'( \vec{p}\;^{k})^{\dagger} (N(\vec{p}\;^{k}) -
    \mathbf{y}  ),
\end{align}
where $N'$ is the Jacobian and $\dagger$ denotes the Moore-Penrose inverse.
The evaluation of the nonlinear operator $\Psi$ is assumed to be well-posed.
    353 
    354 
    355 \section{Newton's method with the neural network operator}
In this section the main results of the talk are explained. The aim is to
exploit the universal approximation property of neural networks, the fact
that they can approximate continuous functions arbitrarily well, for the
inverse problem \ref{eq: main} solved with the Gauss-Newton method. To ensure
convergence it is necessary to show that the considered neural network
structure is a Lipschitz-differentiable immersion. As will be summarized,
this reduces to showing that the activation function, its derivative and the
first moment of its derivative are linearly independent. For this, results
from \cite{lamperski_2022} are used and it is conjectured that the statement
\ref{eq: lpdi-property} is fulfilled, meaning that the functions spanning
$\mathbf{X}_{\vec{p}}$ are linearly independent.
    367 \newline
    368 Convergence is based on the immersion property of the network functions
\begin{align}
    \text{span}\{\partial_{p_i}\Psi(\vec{p})\;:\;i=1,\ldots,n_*\} \qquad
    \text{has dimension } n_*.
\end{align}
To show the Newton-Mysovskii conditions for neural network functions the
following notation is adopted
    375 \begin{align}
    376     \vec{p} := (\vec{\alpha}, \mathbf{w}, \vec{\theta}) \in
    377     \mathbb{R}^{N}\times \mathbb{R}^{n\cdot N} \times \mathbb{R}^{N} =
    378     \mathbb{R}^{n_*}.
    379 \end{align}
    380 
For $\alpha_s \neq 0$, this in particular requires that the functions
\begin{align}
    & \frac{\partial \Psi}{\partial \alpha_s} =\sigma(\rho),\quad
     \frac{\partial \Psi}{\partial w_s^{t}} =\sigma'(\rho)x_t,\quad
     \frac{\partial \Psi}{\partial \theta_s} =\sigma'(\rho),\\
    & \text{where} \qquad
    \rho = \sum_{i=1}^{n}w_s^{i}x_i + \theta_s,
\end{align}
assuming that the activation function is continuously differentiable,
are \textbf{linearly independent}. The requirement $\alpha_s \neq 0$ means
that \textbf{sparse} coefficients cannot be recovered.
    392 \subsection{Linear independence problem}
The question is whether the functions
\begin{align}
    \frac{\partial \Psi}{\partial \alpha_s} ,
    \frac{\partial \Psi}{\partial w_s^{t}} ,
    \frac{\partial \Psi}{\partial \theta_s} \label{eq: linear_indep}
\end{align}
are linearly independent. A partial answer, for
$\frac{\partial \Psi}{\partial \alpha_s} (\vec{x}) =
\sigma\left( \sum_{i=1}^{n} w_s^{i}x_i + \theta_s \right)$, is given by the
following theorem from \cite{lamperski_2022}:
    402 \begin{theorem}
    403     \label{thm: lamperski}
    404     For all activation functions \textit{HardShrink, HardSigmoid, HardTanh,
    HardSwish, LeakyReLU, PReLU, ReLU, ReLU6, RReLU, SoftShrink, Threshold,
    LogSigmoid, Sigmoid, SoftPlus, Tanh and TanhShrink and the PyTorch
    407     functions CELU, ELU, SELU} the shallow neural network functions formed by
    408     \textbf{randomly generated vectors} $(\mathbf{w}, \vec{\theta})$ are
    409     \textbf{linearly independent}.
    410 \end{theorem}
Here, however, more is needed: the first derivative of the activation
function and the first moment of that derivative must, together with the
functions covered by the above result, be linearly independent. This is not
known, so the answer is not fully satisfactory; roughly speaking, it can only
be argued that with probability $1$ the functions above are linearly
independent.
    416 
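Locally, the linear independence of the functions in \ref{eq: linear_indep}
can at least be probed numerically, e.g. by checking that their
discretizations on a grid have full rank (a sketch with the sigmoid and
randomly drawn parameters; full rank of the sampled matrix is only a
necessary surrogate for independence of the functions):
\begin{verbatim}
import numpy as np

sigma  = lambda z: 1.0 / (1.0 + np.exp(-z))
dsigma = lambda z: sigma(z) * (1.0 - sigma(z))

def partials(w, theta, x):
    """Columns: sigma(rho_s), sigma'(rho_s) x_t, sigma'(rho_s) on grid x."""
    rho = x @ w + theta                              # (m, N)
    cols = [sigma(rho)]                              # d Psi / d alpha_s
    cols += [dsigma(rho) * x[:, [t]] for t in range(x.shape[1])]  # d/d w_s^t
    cols.append(dsigma(rho))                         # d Psi / d theta_s
    return np.hstack(cols)                           # (m, N*(n+2))

rng = np.random.default_rng(1)
n, N = 2, 2
w, theta = rng.standard_normal((n, N)), rng.standard_normal(N)
g = np.linspace(0.0, 1.0, 40)
x = np.stack(np.meshgrid(g, g), axis=-1).reshape(-1, n)
print(np.linalg.matrix_rank(partials(w, theta, x)), "expected", N * (n + 2))
\end{verbatim}
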
    417 For the sigmoid function we have some symmetries because
    418 \begin{align}
    419     \sigma'(\mathbf{w}^{T}_j \vec{x} + \theta_j)
    420     = \sigma'\left(-\mathbf{w}_j^{T}\vec{x} - \theta_j  \right)
    421 \end{align}
or in another formulation
\begin{align}
    \Psi'(\vec{\alpha}, \mathbf{w}, \vec{\theta}) = \Psi'(\vec{\alpha},
        -\mathbf{w}, -\vec{\theta}).
\end{align}
The conjecture is that, apart from these obvious symmetries, the parameters
behave like the ``random set'' from \cite{lamperski_2022}: the functions in
\ref{eq: linear_indep} are locally linearly independent, and the results are
built from here on out.
    430 
    431 \begin{conjecture}
    432     \label{conj: main}
    433     Assume that the functions from \ref{eq: linear_indep} are locally
    434     linearly independent.
    435 \end{conjecture}
From the above conjecture it follows that shallow network coders are
Lipschitz-differentiable immersions (for a suitable choice of activation
function), so the Gauss-Newton method converges locally.
    439 \subsection{Gauss-Newton iteration with coding networks}
The local convergence of the Gauss-Newton method is a direct implication of
the immersion property. In this section the proof of the
Lipschitz-differentiable immersion property for shallow neural network coders
and of the convergence of the Gauss-Newton method is summarized.
    444 \begin{lemma}
    445     \label{lemma: immersion}
    Let the activation function $\sigma$ be one of the functions from
    447     \ref{thm: lamperski} \cite{lamperski_2022}. Let $F:\mathbf{X}=L^{2}\left(
    448     [0, 1]^{n}\right) \to \mathbf{Y}$ be linear, bounded and with trivial
    449     nullspace and closed range. Assume the Conjecture \ref{conj: main}
    450     holds, then
    451     \begin{itemize}
    452         \item $\forall \vec{p} = (\vec{\alpha}, \mathbf{w}, \vec{\theta}) \in
    453             \mathbb{R}^{n_*} \subseteq \mathcal{D}(\Psi)$,
    454             $\mathcal{R}(D\Psi[\vec{p}\;])$ is a linear
    455             subspace of $\mathbf{X}$ of dimension $n_*$
    456         \item There exists an open neighborhood $\mathcal{U} \subseteq
    457             \mathbb{R}^{n_*}$ of vectors $\vec{p}$ s.t. $\Psi$ is a
    458             Lipschitz-differentiable immersion in $\mathcal{U}$.
    459     \end{itemize}
    460 \end{lemma}
    461 \begin{proof}
    462     The proof can be found in the appendix \ref{proof: lem-immersion}
    463 \end{proof}
    464 
    465 
The final result states the local convergence of the Gauss-Newton method
for shallow network coders.
\begin{theorem}
    Using the same setup and assumptions as above, additionally let
    $\vec{p}\;^{0} \in \mathcal{D}(\Psi)$ be a starting point of the
    Gauss-Newton iteration \ref{eq: gauss-newton-iteration} and
    $\vec{p}\;^{\dagger} \in \mathcal{D}(\Psi)$ the solution of equation
    \ref{eq: main}. Then the Gauss-Newton method converges quadratically if
    $\vec{p}\;^{0}$ is sufficiently close to $\vec{p}\;^{\dagger}$, i.e. the
    iterates
    \begin{align}
        \vec{p}\;^{k+1} = \vec{p}\;^{k} - N'(\vec{p}\;^{k})^{\dagger}\left(
        N(\vec{p}\;^{k}) - \mathbf{y} \right) \longrightarrow
        \vec{p}\;^{\dagger}.
    \end{align}
\end{theorem}
    479 \begin{proof}
    480 
    481     Trivially applying Lemma \ref{lemma: immersion} to Theorem
    482     \ref{thm: gauss-newton-convergence}.
    483 \end{proof}
    484 
    485 
    486 
    487 
    488 \section{Results}
    489 In the following section the results presented in the paper
    490 \cite{scherzer2023newton} are rerun.
\subsection{Numerical results (simplified)}
    492 The simplification is
    493 \begin{align}
    494     &N = F \circ \Psi \\
    495     &\mathbf{y}^{\dagger} = F\Psi(\vec{p}\;^{\dagger}) \qquad \text{is
    496     attainable}
    497 \end{align}
The aim is to reconstruct the coefficients of a 2D neural network function
    499 \ref{eq: shallow-nn} with
    500 \begin{align}
    501     \vec{p}\;^{\dagger} = \left(
    502         \begin{pmatrix} 1.0\\1.0\\0.1\\0.1 \end{pmatrix};
    503         \begin{pmatrix} 0.3\\0.1\\1.0\\0.8 \end{pmatrix} \right).
    504 \end{align}
That is, the aim is to reconstruct the function
    506 \begin{align}
    507     \Psi^{\dagger}(\vec{x})
    508     &= \Psi(\vec{p}\;^{\dagger})(\vec{x})\\
    509     &=1.0\sigma\left( (1.0,0.1)^{T}\vec{x} + 0.1 \right)
    510     +0.3 \sigma\left( (0.1, 1.0)^{T}\vec{x} + 0.8 \right).
    511 \end{align}
The stopping criterion is $\|\Psi(\vec{p}\;^{k_s+1}) - \Psi^{\dagger}\| \le
\delta = 0.001$.
    514 
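A sketch of how this experiment can be set up (assumptions of the sketch:
sigmoid activation, $F$ the identity as in the simplified setting, a uniform
grid on $[0,1]^{2}$, a random starting point near $\vec{p}\;^{\dagger}$ and a
discrete $L^{2}$ norm; the actual implementation \cite{code} may differ in
these details):
\begin{verbatim}
import numpy as np

sigma  = lambda z: 1.0 / (1.0 + np.exp(-z))
dsigma = lambda z: sigma(z) * (1.0 - sigma(z))
n, N = 2, 2

def split(p):                        # p = (alpha, w (row-major), theta)
    return p[:N], p[N:N + n * N].reshape(n, N), p[N + n * N:]

def Psi(p, x):
    a, w, t = split(p)
    return sigma(x @ w + t) @ a

def Psi_jac(p, x):
    a, w, t = split(p)
    rho = x @ w + t
    cols = [sigma(rho)]                                        # d/d alpha_j
    cols += [a * dsigma(rho) * x[:, [i]] for i in range(n)]    # d/d w_{i,j}
    cols.append(a * dsigma(rho))                               # d/d theta_j
    return np.hstack(cols)

g = np.linspace(0.0, 1.0, 50)
x = np.stack(np.meshgrid(g, g), axis=-1).reshape(-1, 2)
p_dag = np.array([1.0, 0.3, 1.0, 0.1, 0.1, 1.0, 0.1, 0.8])  # (alpha; w; theta)
y = Psi(p_dag, x)
p = p_dag + 0.1 * np.random.default_rng(2).standard_normal(p_dag.size)
for k in range(50):                                  # Gauss-Newton iterations
    r = Psi(p, x) - y
    if np.linalg.norm(r) / np.sqrt(x.shape[0]) <= 1e-3:   # delta = 0.001
        break
    p = p - np.linalg.pinv(Psi_jac(p, x)) @ r
print(k, p)
\end{verbatim}
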
    515 \begin{figure}[H]
    516     \centering
    517     \includegraphics[width=0.8\textwidth]{./pics/gn_coeff.png}
    518     \caption{Coefficients, left truth, middle starting point, right
    519     iteration stop}
    520     \label{fig: gn_coeff}
    521 \end{figure}
The figure below shows the standard loss used for the stopping criterion and
the residual loss
    524 \begin{align}
    525     \log \left( \frac{\|\Psi(\vec{p}\;^{k+1}) -
    526     \Psi^{\dagger}\|}{\|\Psi(\vec{p}\;^{k}) - \Psi^{\dagger}\|} \right)
    527 \end{align}
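
This quantity can be computed directly from the recorded error norms (a
sketch; errs is assumed to hold $\|\Psi(\vec{p}\;^{k}) - \Psi^{\dagger}\|$
for the stored iterates):
\begin{verbatim}
import numpy as np

def residual_loss(errs):
    """log of the ratio of consecutive errors; strongly negative values
    indicate the fast (locally quadratic) contraction of Gauss-Newton."""
    errs = np.asarray(errs, dtype=float)
    return np.log(errs[1:] / errs[:-1])
\end{verbatim}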
    528 \begin{figure}[H]
    529     \centering
    530     \includegraphics[width=0.8\textwidth]{./pics/gn_loss.png}
    531     \caption{Gauss-Newton loss, left standard norm loss, right residual loss}
    532     \label{fig: gn_loss}
    533 \end{figure}
    534 
    535 
    536 For comparison with the Gauss-Newton iteration, consider the Landweber iteration
    537 \begin{align}
    \vec{p}\;^{k+1} = \vec{p}\;^{k} - \lambda \Psi'\left(\vec{p}\;^{k}  \right)^{*}
    539     \left( \Psi(\vec{p}\;^{k}) - \Psi^{\dagger} \right) \qquad k \in
    540     \mathbb{N}_0.
    541 \end{align}
Around $8000$ iterations are needed to reach the same accuracy that the
    543 Gauss-Newton method reaches in $5$, with $\lambda = 0.02$ (e.g. $\lambda=0.2$
    544 diverges).
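
A matching sketch of the Landweber iteration (gradient-descent form; in the
discretized setting the adjoint of the Jacobian is its transpose, and Psi /
jac are callables such as the ones in the Gauss-Newton sketch above):
\begin{verbatim}
import numpy as np

def landweber(Psi, jac, y, p0, lam=0.02, max_iter=20000, delta=1e-3):
    """Landweber: p^{k+1} = p^k - lam * Psi'(p^k)^* (Psi(p^k) - y),
    i.e. gradient descent on 1/2 ||Psi(p) - y||^2 with step size lam."""
    p = np.asarray(p0, dtype=float)
    k = 0
    for k in range(max_iter):
        r = Psi(p) - y
        if np.linalg.norm(r) / np.sqrt(r.size) <= delta:
            break
        p = p - lam * jac(p).T @ r
    return p, k
\end{verbatim}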
    545 \begin{figure}[H]
    546     \centering
    547     \includegraphics[width=0.8\textwidth]{./pics/lw_coeff.png}
    548     \caption{Coefficients, left truth, middle starting point, right
    549     iteration stop}
    550     \label{fig: lw_coeff}
    551 \end{figure}
    552 \begin{figure}[H]
    553     \centering
    554     \includegraphics[width=0.8\textwidth]{./pics/lw_loss.png}
    555     \caption{Landweber loss, left standard norm loss, right residual loss}
    556     \label{fig: lw_loss}
    557 \end{figure}
    558 
The observed convergence rate of the Gauss-Newton method can change
completely if the solution is not attainable. The conjecture is that this
non-convergence is caused by multiple solutions, which can be observed when
not enough reconstruction data is used.
Also, the implementation of the simplified Gauss-Newton method requires
inversion of $F$, which is not done in practice; this is not the case for
Landweber.
    565 \subsection{Alternative to DNNs}
Instead of using deep neural networks, for which it is unknown whether the
immersion property holds, another option worth considering is quadratic
neural network functions, defined as follows
    569 \begin{align}
    570     \Psi(\vec{x}) := \sum_{j=1}^{N} \alpha_j\sigma\left(\vec{x}^{T}A_j\vec{x}
    571         + \mathbf{w}_j^{T}\vec{x} + \theta_j \right),
    572 \end{align}
    573 with $\alpha_j, \theta_j \in \mathbb{R}, \mathbf{w}_j \in \mathbb{R}^{n}$
    574 and $A_j \in \mathbb{R}^{n \times n}$. We can also constrain the class of
    575 $A_j$ and $\mathbf{w}_j$ which leads us to circular networks, circular
    576 affine, elliptic, parabolic...
    577 \begin{theorem}
    578     Quadratic neural network functions satisfy the universal approximation
    579     property.
    580 \end{theorem}
Of interest is the immersion property of circular network functions
\begin{align}
    \Psi(\vec{x}) := \sum_{j=1}^{N} \alpha_j\sigma\left(r_j\vec{x}^{T}\vec{x}
        + \mathbf{w}_j^{T}\vec{x} + \theta_j \right),
\end{align}
i.e. that
\begin{align}
    \text{span}\{\partial_{p_i}\Psi(\vec{p})\;:\;i=1,\ldots,n_*\} \qquad
    \text{has dimension } n_*.
\end{align}
For $\alpha_s \neq 0$, this in particular requires that the functions
    592 \begin{align}
    593     \frac{\partial \Psi}{\partial r_s}  = \sigma\left( \rho \right)
    594     \vec{x}^{T}\vec{x}, \quad
    595     \frac{\partial \Psi}{\partial \alpha_s}  = \sigma\left( \rho \right)
    596     , \quad
    597     \frac{\partial \Psi}{\partial w_s^{t}}  = \sigma'\left( \rho \right)
    598     x_t,\quad
    599     \frac{\partial \Psi}{\partial \theta_s}  = \sigma'\left( \rho \right)
    600 \end{align}
    601 are \textbf{linearly independent}.
    602 \newline
    603 
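A sketch of the quadratic and circular network functions defined above
(sigmoid activation and the parameter shapes are assumptions of the sketch):
\begin{verbatim}
import numpy as np

sigma = lambda z: 1.0 / (1.0 + np.exp(-z))

def quadratic_net(alpha, A, w, theta, x):
    """Psi(x) = sum_j alpha_j sigma(x^T A_j x + w_j^T x + theta_j).
    alpha, theta: (N,);  A: (N, n, n);  w: (n, N);  x: (m, n)."""
    quad = np.einsum('mi,jik,mk->mj', x, A, x)   # (m, N): x^T A_j x
    return sigma(quad + x @ w + theta) @ alpha

def circular_net(alpha, r, w, theta, x):
    """Circular special case A_j = r_j * I."""
    quad = np.outer((x * x).sum(axis=1), r)      # (m, N): r_j ||x||^2
    return sigma(quad + x @ w + theta) @ alpha
\end{verbatim}
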
A result for these types of networks is that the Shepp-Logan phantom can be
represented with \textbf{10 nodes} of elliptic neurons in \textbf{one layer},
whereas affine networks, both shallow and deep, need infinitely many neurons.
    607 
    608 \appendix
    609 \section{Proofs}
The following section is dedicated to the proofs of the theorems and
lemmas in the main text.
    612 \begin{myproof}{Theorem \ref{thm: moore-penrose}}
    613     \label{proof: thm-moore-penrose}
    614     \begin{enumerate}
    615         \item To show that $\Psi'(\vec{p}\;)^{\dagger}$ is indeed the
    616             Moore-Penrose inverse we need to verify the four identities given
            in \ref{eq: moore-penrose}. Aligning the notation with the
            definition of the Moore-Penrose inverse, we have
    619             \begin{align}
    620                 &L = \Psi'(\vec{p}\;) : \vec{P}=\mathbb{R}^{n_*} \to
    621                 \mathbf{X}, \\
    622                 & B = \Psi'(\vec{p}\;)^{\dagger}: \mathbf{X} \to \vec{P}, \\
    623                 &\mathcal{D}(B) = \mathcal{D}(\Psi'(\vec{p}\;)^{\dagger}) =
    624                 \mathbf{X}, \\
    625                 &P: \vec{P} \to \vec{P} \qquad \text{the zero operator},\\
    626                 &Q = P_{\vec{p}}:\mathbf{X} \to \mathbf{X}_{\vec{p}}
    627                 \qquad \text{the projection operator to }
    628                 \mathbf{X}_{\vec{p}}.
    629             \end{align}
    630             For the third identity with $P = 0$, let $\vec{q} =
    631             (q_i)_{i=1}^{n_*} \in \vec{P}$ and we have
    632             \begin{align}
    633                 \Psi'(\vec{p}\;)^{\dagger}\Psi'(\vec{p}\;)\vec{q} &=
                \Psi'(\vec{p}\;)^{\dagger}\left( \sum_{i=1}^{n_*} q_i
    635                 \partial_{p_i} \Psi(\vec{p}\;) \right) \\
    636               &=\left(q_i  \right)_{i=1}^{n_*}  \\
    637               &= \vec{q}.
    638             \end{align}
            For the fourth identity we use the fact that $(\partial_{p_i}
            \Psi(\vec{p}\;))_{i=1}^{n_*}$ is a basis of $\mathbf{X}_{\vec{p}}$,
            so for every $\mathbf{x} \in \mathbf{X}$ there exist coefficients
            $x_i$, $i \in \{1,\ldots,n_*\}$, and a remainder $\mathbf{x}_2$
            such that $\mathbf{x}$ can be represented as
    644             \begin{align}
    645                 \mathbf{x} = \sum_{i=1}^{n_*}
    646                 x_i\partial_{p_i}\Psi(\vec{p}\;) + \mathbf{x_2},
    647             \end{align}
    648         with $\mathbf{x}_2 \in \mathbf{X}^{\perp}_{\vec{p}}$. Meaning that
        \begin{align}
            P_{\vec{p}}\;\mathbf{x} = \sum_{i=1}^{n_*} x_i\partial_{p_i}
            \Psi(\vec{p}\;) \qquad \text{and}\\
            \Psi'(\vec{p}\;)^{\dagger}\mathbf{x} = (x_i)_{i=1}^{n_*}=\vec{x}.
        \end{align}
        Then the identity reduces to
        \begin{align}
            \Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}\mathbf{x} =
            \Psi'(\vec{p}\;)\vec{x} = P_{\vec{p}}\;\mathbf{x}.
        \end{align}
        For the second identity, use the previous results; then for all
        $\mathbf{x} \in \mathbf{X}$:
    662         \begin{align}
    663             \Psi'(\vec{p}\;)^{\dagger}\Psi'(\vec{p}\;)
    664             \Psi'(\vec{p}\;)^{\dagger}\mathbf{x}
    665             &= \Psi'(\vec{p}\;)^{\dagger}
    666             \Psi'(\vec{p}\;)\vec{x} \\
    667             &= \vec{x}\\
            &= \Psi'(\vec{p}\;)^{\dagger} \mathbf{x}.
    669         \end{align}
        For the first identity, using the previous results, $\forall \vec{q}
        \in \vec{P}$:
        \begin{align}
            \Psi'(\vec{p}\;)
            \Psi'(\vec{p}\;)^{\dagger}
            \Psi'(\vec{p}\;) \vec{q}
            &= \Psi'(\vec{p}\;) \vec{q}\\
            (&= L\vec{q}\;).
        \end{align}
    679 
        \item For the boundedness claim: TODO.
    681     \end{enumerate}
    682 \end{myproof}
    683 
    684 \begin{myproof}{Lemma \ref{lem: moore-penrose}}
    685     \label{proof: lem-moore-penrose}
    686     \begin{enumerate}
    687         \item
    688             First of all by the chain rule it holds that
    689             \begin{align}
    690                 N'(\vec{p}\;) = F\Psi'(\vec{p}\;) \qquad
    691                 \forall \vec{p} \in \mathcal{D}(\Psi) = \mathcal{D}(N).
    692             \end{align}
    693             To prove the decomposition property of the Moore-Penrose
    694             inverse it is necessary to verify the equations
    695             \ref{eq: moore-penrose} defining it. With the notation
    696             \begin{align}
    697                 &L:= N'(\vec{p}\;) = F\Psi'(\vec{p}\;): \vec{P} \to
    698                 \mathbf{Y} \\
    699                 &B:= \Psi'(\vec{p}\;)^{\dagger}F^{-1} : \mathcal{R}(F)
    700                 \subseteq \mathbf{Y} \to \vec{P}.
    701             \end{align}
    702             Also note that by assumption $F$ has a dense range and consider
    703             $B$ on $\mathcal{R}(F) \dotplus
            \underbrace{\mathcal{R}(F)^{\perp}}_{=\{0\}}$. Thus, for
            a fixed $\vec{p}$, in the notation of \ref{eq: moore-penrose} we have
    706             \begin{align}
    707                 &\mathcal{D}(B) =
    708                 \mathcal{D}(\Psi'(\vec{p}\;)^{\dagger}F^{-1}) =
    709                 \mathcal{R}(F),\\
    710                 &\mathcal{R}(L) = \big\{F\Psi'(\vec{p}\;)\vec{q}\;:\;\vec{q}
    711                 \in \mathbb{R}^{n_*} \big\} = \mathcal{R}(FP_{\vec{p}}).
    712             \end{align}
    713             Note that $Q$ is an orthogonal projection onto
    714             $\overline{\mathcal{R}(L)}$, i.e.
    715             $\overline{\mathcal{R}(FP_{\vec{p}})}$, then for
    716             \begin{align}
    717                 \mathbf{z} &= F\mathbf{x} \\
    718                     &= FP_{\vec{p}}\;
    719                     \mathbf{x} + F\left(I-P_{\vec{p}} \right) \mathbf{x}
    720             \end{align}
    721             we have that
    722             \begin{align}
    723                 Q\mathbf{z} &= Q\left(FP_{\vec{p}}\;\mathbf{x}
    724                    + F(I-P_{\vec{p}})(\mathbf{x})\right) \\
    725                 &= FP_{\vec{p}}\; \mathbf{x}.
    726             \end{align}
            Finally, applying Theorem \ref{thm: moore-penrose} and the
            invertibility of $F$ on its range shows that
    729             \begin{align}
    730                 LBL &= F\Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}
    731                 F^{-1}F\Psi'(\vec{p}\;) \\
    732                     &=
    733                     F\Psi'(\vec{p}\;)
    734                     \Psi'(\vec{p}\;)^{\dagger}\Psi'(\vec{p}\;) \nonumber\\
    735                     &= F \Psi'(\vec{p}\;) \nonumber \\
    736                     &= L \nonumber \\
    737                     \nonumber \\
    738                 BLB &= \Psi'(\vec{p}\;)^{\dagger}F^{-1}F\Psi'(\vec{p}\;)
    739                 \Psi'(\vec{p}\;)^{\dagger}F^{-1} \\
    740                     &=
    741                     \Psi'(\vec{p}\;)^{\dagger}
    742                     \Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}F^{-1} \nonumber\\
    743                     &= \Psi'(\vec{p}\;)^{\dagger}F^{-1} \nonumber \\
    744                     &= B \nonumber \\
    745                     \nonumber \\
    746                 BL &= \Psi'(\vec{p}\;)^{\dagger}F^{-1}F\Psi'(\vec{p}\;) \\
    747                    &= I - P \\
    748                    &= I \quad \text{on }  \mathbb{R}^{n_*}.
    749             \end{align}
    750             The fourth identity is proven by taking a $\mathbf{z} \in
            \mathcal{R}(F)$, so there exists an $\mathbf{x} \in \mathbf{X}$
            such that $F\mathbf{x} = \mathbf{z}$; then it follows that
    753             \begin{align}
    754                 LB\mathbf{z} &=
    755                 F\Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}
    756                 F^{-1}\mathbf{z} \\
    757                  &= F\Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}
    758                      \mathbf{x} \\
    759                 &= FP_{\vec{p}}\mathbf{x} \\
    760                 &= Q\mathbf{z}.
    761             \end{align}
    762         \item As for the generalized Newton-Mysovskii conditions, the
    763             operator $\Psi'(\vec{p})$ is locally bounded and locally
    764             Lipschitz continuous on $\mathcal{D}(\Psi)$, meaning that there are
    765             constants $C_L, C_l > 0$ such that $\forall \vec{p}, \vec{q} \in
    766             \mathcal{D}(\Psi)$ the following inequalities hold
    767             \begin{align}
    768                 &\big\|\Psi'(\vec{p}\;) - \Psi'(\vec{q}\;)
    769                 \big\|_{\vec{P}\to\mathbf{X}} \leq C_L \|\vec{p} - \vec{q} \|
    770                 \\
    771                 &\big\|\Psi'(\vec{p}) \big\|_{\vec{P} \to \mathbf{X}} \le
    772                 C_l.
    773             \end{align}
    774             Thus
            \begin{align}
                &\Big\| N'(\vec{p}\;)^{\dagger}\left( N'(\vec{q} + s(\vec{p} -
                \vec{q}\;)) - N'(\vec{q}\;)\right)
                (\vec{p} - \vec{q}) \Big\|_{\vec{P}} \\
                &= \Big\|
                \Psi'(\vec{p}\;)^{\dagger}F^{-1}\left( F\Psi'(\vec{q}
                + s\left( \vec{p} - \vec{q}\; \right))-F\Psi'(\vec{q}\;)
            \right) \left(\vec{p} - \vec{q}  \right) \Big\|_{\vec{P}}
            \nonumber \\
                &= \Big\|
                \Psi'(\vec{p}\;)^{\dagger}\left(\Psi'(\vec{q}
                + s\left( \vec{p} - \vec{q}\; \right))-\Psi'(\vec{q}\;)
            \right) \left(\vec{p} - \vec{q}  \right) \Big\|_{\vec{P}}
            \nonumber\\
                &\leq C_lC_L s \|\vec{p} - \vec{q} \|^{2}_{\vec{P}},
                \nonumber
            \end{align}
    792         for all $\vec{p}, \vec{q} \in \mathcal{D}(\Psi) = \mathcal{D}(N)$.
    793 
    794     \end{enumerate}
    795 
    796 \end{myproof}
    797 
    798 \begin{myproof}{Theorem \ref{thm: gauss-newton-convergence}}
    799     \label{proof: thm-gauss-newton-convergence}
    The distance between the solution and the first iterate is $\rho =
    \|\vec{p}\;^{\dagger} - \vec{p}\;^{0}\|$. Also $\mathcal{D}(\Psi) =
    \mathcal{D}(N)$, since $F$ is defined on all of $\mathbf{X}$. First of all
    we have to prove that all iterates lie in the closed ball, i.e.
    $\vec{p}\;^{k} \in \overline{\mathcal{B}(\vec{p}\;^{\dagger}; \rho)}$;
    this is done by induction. The case $k=0$ is fulfilled by the assumption
    on the starting point $\vec{p}\;^{0}$.
    For the induction step use the first identity in \ref{eq: moore-penrose}:
    \begin{align}
        &N'(\vec{p}\;^{k})N'(\vec{p}\;^{k})^{\dagger}N'(\vec{p}\;^{k})
        \left(\vec{p}\;^{k+1} - \vec{p}\;^{\dagger}  \right)\\
        &= N'(\vec{p}\;^{k})(\vec{p}\;^{k+1} - \vec{p}\;^{\dagger}).
    \end{align}
    812     Then the definition of the Gauss-Newton method
    813     \ref{eq: gauss-newton-iteration} implies
    814     \begin{align}
        &N'(\vec{p}\;^{k})(\vec{p}\;^{k+1}-\vec{p}\;^{\dagger}) \\
    816         &=N'(\vec{p}\;^{k})N'(\vec{p}\;^{k})^{\dagger}\left(
    817         N(\vec{p}\;^{\dagger})-N(\vec{p}\;^{k})
    818         - N'(\vec{p}\;^{k})(\vec{p}\;^{\dagger} - \vec{p}\;^{k})\right) .
    819     \end{align}
    820     Now using the third identity in \ref{eq: moore-penrose}, by the assumption
    821     of this theorem $P = 0$ and using the second identity in
    822     \ref{eq: moore-penrose} with the fact that $F$ is injective, it follows
    823     that
    824     \begin{align}
    825         \vec{p}\;^{k+1} - \vec{p}\;^{\dagger}
    826         &=
    827         N'(\vec{p}\;^{k})^{\dagger}N'(\vec{p}\;^{k})(\vec{p}\;^{k+1} -
    828         \vec{p}^{\dagger}) \\
    829         &=
    830         N'(\vec{p}\;^{k})^{\dagger}\left(N(\vec{p}\;^{\dagger})-N(\vec{p}\;^{k})
    831         - N'(\vec{p}\;^{k})(\vec{p}\;^{\dagger} - \vec{p}\;^{k}) \right) \\
        &= \Psi'(\vec{p}\;^{k})^{\dagger}\left(\Psi(\vec{p}\;^{\dagger})-
        \Psi(\vec{p}\;^{k}) -
    834     \Psi'(\vec{p}\;^{k})(\vec{p}\;^{\dagger}-\vec{p}\;^{k}) \right).
    835     \end{align}
    Finally the Newton-Mysovskii conditions give us
    837     \begin{align}
    838         \|\vec{p}\;^{k+1} - \vec{p}\;^{\dagger}\|
    839         &\le \frac{C_lC_L}{2}
    840         \|\vec{p}\;^{k} - \vec{p}\;^{\dagger}\|^{2}_{\vec{P}} \\
    841         &\le \frac{C_lC_L\rho}{2}
    842         \|\vec{p}\;^{k}-\vec{p}\;^{\dagger}\|_{\vec{P}} \\
    843         &< \|\vec{p}\;^{k} - \vec{p}\;^{\dagger}\|_{\vec{P}},
    844     \end{align}
    845     meaning that $\vec{p}\;^{k+1} \in \mathcal{B}(\vec{p}\;^{\dagger};
    846     \rho)$, thereby the Gauss-Newton method is well defined. Also
    847     \begin{align}
    848         \|\vec{p}\;^{k+1} - \vec{p}\;^{\dagger}\|
    849         &\leq \left( \frac{C_lC_L\rho}{2}  \right)^{k+1}
    850         \|\vec{p}\;^{0} - \vec{p}\;^{\dagger}\| \\
    851         &\le \left( \frac{C_lC_L\rho}{2}  \right)^{k+1} \rho,
    852     \end{align}
    where $\frac{C_lC_L\rho}{2} < 1$, meaning that
    $\|\vec{p}\;^{k} - \vec{p}\;^{\dagger}\| \to 0$ as $k\to \infty$, i.e. the
    Gauss-Newton method converges, and by the first estimate it does so
    quadratically. \qed
    855 \end{myproof}
    856 
    857 \begin{myproof}{Lemma \ref{lemma: immersion}}
    858     \label{proof: lem-immersion}
    859     \begin{itemize}
    860         \item Because of the differentiability assumptions of $\sigma$, for
    all fixed $\vec{p}$, the partial derivatives $\partial_{p_i}\Psi(\vec{p}\;)$
    lie in $L^{2}\left([0,1]^{n}\right)$. So the Conjecture \ref{conj: main} directly
    863     implies that $\mathcal{R}(D\Psi[\vec{p}\;])$ is a linear subspace of
    864     $\mathbf{X}$ of dimension $n_*$.
    865 \item Note that the operator $D^{2}\Psi[\vec{p}\;]: \mathbb{R}^{n_*} \to
    L^{2}([0, 1]^{n})$ is continuous (assuming that the activation function
    $\sigma$ is twice differentiable). Considering a nonempty neighborhood
    868     $\mathcal{U}$ of $\vec{p}$ with a compact closure, the continuity of
    869     $D^{2}\Psi$ w.r.t $\vec{p}$ gives that $D\Psi$ is
    870     Fr\'echet-differentiable with Lipschitz-continuous derivative on
    871     $\mathcal{U}$. Meaning there exist constants $C_l, C_L > 0$ such that
    872     for all $\vec{q}, \vec{p} \in \mathcal{D}(\Psi)$ it holds that
    873         \begin{align}
    874             &\big\|\Psi'(\vec{p}\;) - \Psi'(\vec{q}\;)
    875             \big\|_{\vec{P}\to\mathbf{X}} \leq C_L \|\vec{p} - \vec{q} \|
    876             \\
    877             &\big\|\Psi'(\vec{p}) \big\|_{\vec{P} \to \mathbf{X}} \le
    878             C_l.
    879         \end{align}
    880         \qed
    881     \end{itemize}
    882 
    883 \end{myproof}
    884 
    885 
    886 
    887 
    888 
    889 \nocite{kaltenbacher2008}
    890 \nocite{frischauf2022universal}
    891 \printbibliography
    892 
    893 \end{document}