\include{./preamble.tex}


\begin{document}

\maketitle

\tableofcontents

\section{Intro}
The talk is about the paper \cite{scherzer2023newton}, in which the authors
consider a modified version of the Newton algorithm to reconstruct the
coefficients of neural network coders in inverse problems. Instead of the
inverse of the Jacobian, the iteration uses a more general Moore-Penrose
inverse. Under suitable conditions this modified Newton iteration, called the
Gauss-Newton iteration, can be proven to converge locally and quadratically.
The aim is to apply these results specifically to shallow neural networks,
because they satisfy an immersion property, i.e. they are
Lipschitz-differentiable immersions, which is the prerequisite for the
convergence of the Gauss-Newton method. Additionally, the presented results
show a faster and more reliable reconstruction than the method used for
comparison, the Landweber iteration. I also verified these results by
reimplementing the method and running the same initial setup, arriving at the
same conclusions \cite{code}.

\subsection{Posing the problem}
Consider the linear operator equation between Hilbert spaces $\mathbf{X}$ and
$\mathbf{Y}$
\begin{align}
    F\mathbf{x} = \mathbf{y}.
\end{align}
To model the problem a nonlinear operator called a \textbf{coding},
$\Psi: \vec{P} \to \mathbf{X}$, is introduced, which maps neural network
parameters to image functions. The problem equation can then be written as
\begin{align}
    N(\vec{p}) = F\Psi(\vec{p}) = \mathbf{y}, \label{eq: main}
\end{align}
where $\mathbf{X}$ is the image space, $\mathbf{Y}$ the data space and
$\vec{P}$ the parameter space. If the operator $F$ were nonlinear, we would of
course obtain a fully nonlinear equation, which is not considered here.

\subsection{Decomposition cases (review)}
An operator $N$ satisfies the \textit{1st decomposition case} in a non-empty
open neighborhood $\mathcal{B}\left(\vec{p}\;^{\dagger}; \rho \right)
\subseteq \vec{P}$ (an open ball around the point $\vec{p}\;^{\dagger}$ with
radius $\rho$), if there exist a linear operator $F:\vec{P}\to \mathbf{X}$
and a nonlinear operator $\Psi:\mathbf{X} \to \mathbf{Y}$ such that
\begin{align}
    N(\vec{p}) = \Psi(F\vec{p}).
\end{align}
The \textit{2nd decomposition case} for the operator $N$ in the same setting
is satisfied, if there exist a linear operator $F: \mathbf{X} \to \mathbf{Y}$
and a nonlinear operator $\Psi: \vec{P} \to \mathbf{X}$ such that
\begin{align}
    N(\vec{p}) = F\Psi(\vec{p}).
\end{align}

\subsection{Gauss-Newton type method for 2nd decomposition case}
The main object is the operator $\Psi:\mathcal{D}(\Psi) \subseteq \vec{P} :=
\mathbb{R}^{n_*} \to \mathbf{X}$. The derivative of $\Psi$ is generally not
invertible.
\begin{align}
    N(\vec{p}) = F\Psi(\vec{p}).
\end{align}
The aim is to use a modified version of the Newton method to reconstruct the
parameters $\vec{p}$. The prerequisites for this are some background on the
local convergence of the Newton method, the so-called Newton-Mysovskii
conditions, and the definition of the Moore-Penrose inverse.

\section{Background}
\subsection{Newton-Mysovskii}
The local convergence of the Newton method is guaranteed under the so-called
Newton-Mysovskii conditions. In this section the results are summarized for
the simple finite-dimensional case, where the nonlinear operator has an
invertible derivative. Extending this result is the aim of this summary.
\begin{theorem}
    Let $N: \mathcal{D}(N) \subseteq \mathbb{R}^{n}\to \mathbb{R}^{n}$ be
    continuously Fr\'echet differentiable on a non-empty, open and convex
    set $\mathcal{D}\left( N \right)$. Let $\vec{p}\;^{\dagger} \in
    \mathcal{D}(N)$ be a solution of $N(\vec{p}\;) = \mathbf{y}$, where
    $\mathbf{y} \in \mathbb{R}^{n}$. Also assume that
    \begin{enumerate}
        \item $N'(\vec{p}\;)$ is invertible for all $\vec{p} \in
            \mathcal{D}(N)$ and
        \item the Newton-Mysovskii conditions hold, i.e. $\exists C_N > 0:$
            \begin{align}
                &\big\| N'(\vec{p}\;)^{-1}\left( N'(\vec{q} + s(\vec{p} -
                \vec{q}\;)) - N'(\vec{q}\;)\right) (\vec{p} - \vec{q})
                \big\|_{\vec{P}}
                \leq s C_N \|\vec{p} - \vec{q} \;\|^{2}_{\vec{P}}\\
                & \forall \vec{p}, \vec{q} \in \mathcal{D}(N), \; s \in[0, 1].
            \end{align}
    \end{enumerate}
    Now let $\vec{p}\;^{0} \in \mathcal{D}(N)$ be the starting point of the
    Newton iteration, which should be sufficiently close to the solution,
    i.e. satisfying
    \begin{align}
        &\overline{\mathcal{B}\left(\vec{p}\;^{0}, \rho \right)}\subseteq
        \mathcal{D}(N) \qquad \text{with}\\
        &\rho := \|\vec{p}\;^{\dagger} - \vec{p}\;^{0}\|_{\vec{P}} \quad
        \text{and} \quad h:= \frac{\rho C_N}{2} <1. \label{eq: locality}
    \end{align}
    Then the Newton iteration with starting point $\vec{p}\;^{0}$ and
    iterates $\left\{\vec{p}\;^{k} \right\}_{k \in \mathbb{N}_0} \subseteq
    \overline{\mathcal{B}(\vec{p}\;^{0}, \rho)}$ of the form
    \begin{align}
        \vec{p}\;^{k+1} = \vec{p}\;^{k} - N'(\vec{p}\;^{k})^{-1}\left(
        N\left(\vec{p}\;^{k} \right) - \mathbf{y} \right) \qquad k \in
        \mathbb{N}_0,
    \end{align}
    converges \textbf{quadratically} to $\vec{p}\;^{\dagger} \in
    \overline{\mathcal{B}(\vec{p}\;^{0}, \rho)}$.
\end{theorem}
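
To make the iteration above concrete, the following is a minimal Python
sketch (my own illustration, not taken from \cite{scherzer2023newton} or
\cite{code}) of the classical Newton method for a made-up system in
$\mathbb{R}^{2}$ with an invertible Jacobian:
\begin{verbatim}
import numpy as np

def N(p):
    # toy nonlinear operator N: R^2 -> R^2, made up for this illustration
    return np.array([p[0]**2 + p[1], np.sin(p[1]) + p[0]])

def N_prime(p):
    # Jacobian of N at p
    return np.array([[2.0 * p[0], 1.0],
                     [1.0, np.cos(p[1])]])

def newton(p0, y, n_iter=8):
    # classical Newton iteration p^{k+1} = p^k - N'(p^k)^{-1} (N(p^k) - y)
    p = p0.astype(float).copy()
    for _ in range(n_iter):
        p = p - np.linalg.solve(N_prime(p), N(p) - y)
    return p

p_true = np.array([0.7, 0.3])
y = N(p_true)
p_rec = newton(np.array([0.65, 0.35]), y)   # start sufficiently close to p_true
print(p_rec, np.linalg.norm(N(p_rec) - y))  # residual drops very fast near p_true
\end{verbatim}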
\subsection{Moore-Penrose Inverse}
Now the case where $\mathbf{Y}$ is an infinite dimensional Hilbert space is
introduced. It is necessary to replace the inverse in the classical Newton
method, because the linearizations of the operator $N$ are in general not
invertible. This is done by introducing the so-called Moore-Penrose inverse,
or more generally an outer inverse, and we refer to the resulting iteration
as the Gauss-Newton method to distinguish it from the classical version.
\begin{mydef}{Inner, outer and Moore-Penrose inverse
    \label{def: moore-penrose}}
    Let $L: \vec{P} \to \mathbf{Y}$ be a linear and bounded operator between
    vector spaces $\vec{P}$ and $\mathbf{Y}$. Then
    \begin{enumerate}
        \item $B: \mathbf{Y} \to \vec{P}$ is called a \textbf{left-inverse}
            of $L$ if
            \begin{align}
                BL = I.
            \end{align}
        \item $B: \mathbf{Y} \to \vec{P}$ is called a \textbf{right-inverse}
            of $L$ if
            \begin{align}
                LB = I.
            \end{align}
        \item $B: \mathbf{Y} \to \vec{P}$ is called an \textbf{inverse} of
            $L$ if it is both a left and a right inverse of $L$.
        \item $B: \mathbf{Y} \to \vec{P}$ is called an \textbf{outer inverse}
            of $L$ if
            \begin{align}
                BLB = B.
            \end{align}
        \item Let $\vec{P}$ and $\mathbf{Y}$ be Hilbert spaces and $L: \vec{P}
            \to \mathbf{Y}$ a linear bounded operator. Denote by $P$ and $Q$
            the orthogonal projections onto the (closed) nullspace
            $\mathcal{N}(L)$ and onto the closure of the range
            $\overline{\mathcal{R}\left(L \right)}$, respectively. This means
            that for all $\vec{p} \in \vec{P}$ and $\mathbf{y} \in \mathbf{Y}$
            we have
            \begin{align}
                &P\vec{p} = \text{argmin}
                \left\{
                    \|\vec{p}_1-\vec{p}\|_{\vec{P}} : \vec{p}_1 \in
                    \mathcal{N}(L) \right\},\\
                &Q\mathbf{y} = \text{argmin}
                \left\{
                    \|\mathbf{y}_1 - \mathbf{y}\|_\mathbf{Y}: \mathbf{y}_1 \in
                    \overline{\mathcal{R}(L)} \right\}.
            \end{align}
            Then the operator $B: \mathcal{D}(B) \subseteq \mathbf{Y} \to
            \vec{P}$ with $\mathcal{D}(B):= \mathcal{R}(L) \dotplus
            \mathcal{R}(L)^{\perp}$ is called the \textbf{Moore-Penrose
            inverse} of $L$ if the following conditions (identities) hold
            \begin{align}
                &LBL = L, \nonumber\\
                &BLB = B, \nonumber\\
                &BL= I-P, \label{eq: moore-penrose}\\
                &LB = Q|_{\mathcal{D}(B)} \nonumber.
            \end{align}
    \end{enumerate}
    The left and right inverses are used in different contexts. For a left
    inverse to exist, the nullspace of $L$ has to be trivial, in contrast to
    the Moore-Penrose inverse $B$. For a right inverse, on the other hand,
    the nullspace of $B$ needs to be trivial.
\end{mydef}
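
In finite dimensions the Moore-Penrose inverse is the familiar pseudoinverse.
The following short numpy sketch (an illustration added here, with an
arbitrarily chosen matrix) checks the four identities of the definition for a
rank-deficient matrix:
\begin{verbatim}
import numpy as np

# A rank-deficient "operator" L: R^3 -> R^4 (rank 2), chosen arbitrarily.
L = np.array([[1., 0., 1.],
              [0., 1., 1.],
              [1., 1., 2.],
              [2., 1., 3.]])
B = np.linalg.pinv(L)            # Moore-Penrose inverse of L

P = np.eye(3) - B @ L            # third identity: B L = I - P
Q = L @ B                        # fourth identity: L B = Q (on D(B))

print(np.allclose(L @ B @ L, L))                    # L B L = L
print(np.allclose(B @ L @ B, B))                    # B L B = B
print(np.allclose(P, P.T), np.allclose(P @ P, P))   # P projects onto N(L)
print(np.allclose(Q, Q.T), np.allclose(Q @ Q, Q))   # Q projects onto R(L)
\end{verbatim}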
\subsection{Lipschitz-differentiable immersion}
\begin{mydef}{Lipschitz-differentiable immersion}
    Let $\Psi$ be a network function depending on the $n_* = N(n+2)$
    parameters $\vec{p} = (\vec{\alpha}, \mathbf{w}, \vec{\theta})$. Let
    $\Psi'$ be Lipschitz-continuous and let
    \begin{align}
        \mathbf{X}_{\vec{p}} =
        \text{span}\{\partial_{p_i}\Psi(\vec{p})\;:\;i=1,\ldots,n_*\},
        \label{eq: lpdi-property}
    \end{align}
    have rank $n_*$. Then $\Psi$ is called a Lipschitz-differentiable
    immersion, and $\Psi'(\vec{p})^{\dagger}$ denotes a generalized inverse,
    which replaces the standard inverse in the classical Newton method.
\end{mydef}
Linking the inverse of the Lipschitz-differentiable immersion with the
Moore-Penrose inverse, together with the following results, delivers the
Gauss-Newton method.
\begin{theorem}
    \label{thm: moore-penrose}
    \begin{enumerate}
        \item The function $\Psi'(\vec{p}\;)^{\dagger}: \mathbf{X} \to
            \vec{P}$ is the Moore-Penrose inverse of $\Psi'(\vec{p}\;)$.
        \item For all $\vec{p} \in \mathcal{D}(\Psi) \subseteq \vec{P}$
            there is a non-empty closed neighborhood where
            $\Psi'(\vec{p}\;)^{\dagger}$ is uniformly bounded and
            Lipschitz-continuous, i.e.
            \begin{align}
                &\|\Psi'(\vec{p}\;)^{\dagger} -
                \Psi'(\vec{q}\;)^{\dagger}\|_{\mathbf{X}\to\vec{P}}
                \leq C_L \|\vec{p} - \vec{q}\;\|_{\vec{P}}&\\
                &\|\Psi'(\vec{p}\;)^{\dagger}
                \|_{\mathbf{X}\to\vec{P}} \leq C_l\qquad &\text{for}\;\;
                \vec{p}, \vec{q}\in \mathcal{D}(\Psi).
            \end{align}
        \item The operator $P_{\vec{p}}: \mathbf{X} \to \mathbf{X}_{\vec{p}}$
            is bounded.
    \end{enumerate}
\end{theorem}
\begin{proof}
    The proof can be found in the appendix, \ref{proof: thm-moore-penrose}.
\end{proof}
We can now wrap these results back to the original problem \ref{eq: main} of
the Gauss-Newton method.
\begin{lemma}
    \label{lem: moore-penrose}
    Let $F$ be as in \ref{eq: main}: linear, bounded, with trivial nullspace
    and dense range (for the Moore-Penrose inverse). Let $\Psi:
    \mathcal{D}(\Psi)\subseteq \mathbb{R}^{n_*} \to \mathbf{X}$ be a
    Lipschitz-differentiable immersion, and let $N = F\circ \Psi$ as in
    \ref{eq: main}. Note the immediate consequence that $\mathcal{D}(N) =
    \mathcal{D}(\Psi)$; therefore for all $\vec{p} \in \mathcal{D}(N)$ the
    derivative of the operator $N$ has a Moore-Penrose inverse
    $N'(\vec{p}\;)^{\dagger}$, satisfying
    \begin{enumerate}
        \item the decomposition property of the Moore-Penrose inverse,
            \begin{align}
                N'(\vec{p}\;)^{\dagger}\mathbf{z} =
                \Psi'(\vec{p}\;)^{\dagger}F^{-1}\mathbf{z} \qquad \forall
                \vec{p} \in \mathcal{D}(N),\; \mathbf{z} \in \mathcal{R}(F)
                \subseteq \mathbf{Y},
            \end{align}
            which means that
            \begin{align}
                &N'(\vec{p}\;)^{\dagger}N'(\vec{p}\;) = I \quad \text{on} \;\;
                \mathbb{R}^{n_*} \qquad \text{and}\\
                &N'(\vec{p}\;)N'(\vec{p}\;)^{\dagger} =
                \mathcal{Q}|_{\mathcal{D}(N'(\vec{p}\;)^{\dagger})},
            \end{align}
            where $\mathcal{Q}:
            \overline{\mathcal{R}(FP_{\vec{p}})}\dotplus
            \mathcal{R}(FP_{\vec{p}})^{\perp} \to \mathbf{Y}$ is the
            orthogonal projection onto $\overline{\mathcal{R}(FP_{\vec{p}})}$.
            So in the definition of the Moore-Penrose inverse
            \ref{def: moore-penrose}, the projection $P$ in
            \ref{eq: moore-penrose} is $P \equiv 0$.
        \item The generalized Newton-Mysovskii conditions are also satisfied:
            \begin{align}
                &\big\| N'(\vec{p}\;)^{\dagger}\left( N'(\vec{q} + s(\vec{p} -
                \vec{q}\;)) - N'(\vec{q}\;)\right) (\vec{p} - \vec{q})
                \big\|_{\vec{P}}
                \leq s C_l C_L \|\vec{p} - \vec{q} \;\|^{2}_{\vec{P}}\\
                & \forall \vec{p}, \vec{q} \in \mathcal{D}(N), \; s \in[0, 1],
            \end{align}
            where $C_l, C_L$ are the constants from Theorem
            \ref{thm: moore-penrose}.
    \end{enumerate}
\end{lemma}
\begin{proof}
    The proof can be found in the appendix, \ref{proof: lem-moore-penrose}.
\end{proof}

Bringing it all together yields the local convergence of the Gauss-Newton
method, where $N = F\circ \Psi$ is the composition of a linear bounded
operator and a Lipschitz-differentiable immersion.
\begin{theorem}
    \label{thm: gauss-newton-convergence}
    Assume there exists a $\vec{p}\;^{\dagger} \in \mathcal{D}(\Psi)$ which
    satisfies
    \begin{align}
        N(\vec{p}\;^{\dagger}) = \mathbf{y},
    \end{align}
    and assume there exists a $\vec{p}\;^{0} \in \mathcal{D}(\Psi)$ satisfying
    the locality condition \ref{eq: locality} (with $C_N$ replaced by
    $C_l C_L$). Then the iterates $\{\vec{p}\;^{k}\}_{k\in \mathbb{N}_0}$ of
    the Gauss-Newton method of the form
    \begin{align}
        \vec{p}\;^{k+1} = \vec{p}\;^{k} - N'(\vec{p}\;^{k})^{\dagger}\left(
        N\left(\vec{p}\;^{k} \right) - \mathbf{y} \right) \qquad k \in
        \mathbb{N}_0, \label{eq: gauss-newton-iteration}
    \end{align}
    are well-defined in $\overline{\mathcal{B}\left(\vec{p}\;^{0}, \rho
    \right)}$ and converge quadratically to $\vec{p}\;^{\dagger}$.
\end{theorem}
\begin{proof}
    The proof can be found in the appendix,
    \ref{proof: thm-gauss-newton-convergence}.
\end{proof}


\subsection{Neural networks}
Shallow neural network coders are of the following form:
\begin{align}
    \label{eq: shallow-nn}
    \Psi:
    \mathcal{D}(\Psi) := \mathbb{R}^{n_*} =
    \mathbb{R}^{N}\times \mathbb{R}^{n \times N}
    \times \mathbb{R}^{N}
    &\to \mathbf{X} :=
    L^{2}\left([0, 1]^{n}\right),\\
    \vec{p} = (\vec{\alpha}, \mathbf{w}, \vec{\theta}) &\mapsto
    \left(\vec{x} \mapsto \sum_{j=1}^{N} \alpha_j\sigma\left(
    \mathbf{w}_j^{T}\vec{x} + \theta_j \right) \right),
\end{align}
where $\sigma$ is an activation function, such as tanh or the sigmoid.
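
As an illustration (not part of the paper), a shallow coder of this form can
be evaluated on a grid discretization of $[0, 1]^{2}$ as follows; the
function name, the tanh activation and the grid resolution are choices made
here, and the parameter values are the ones used in the results section
below:
\begin{verbatim}
import numpy as np

def shallow_coder(alpha, w, theta, X):
    # Psi(p)(x) = sum_j alpha_j * sigma(w_j^T x + theta_j), at the rows of X
    # alpha: (N,), w: (N, n), theta: (N,), X: (m, n) grid points in [0, 1]^n
    sigma = np.tanh                       # activation function
    return (alpha * sigma(X @ w.T + theta)).sum(axis=1)

# N = 2 neurons in dimension n = 2; values of p^dagger from the results section
alpha = np.array([1.0, 0.3])
w     = np.array([[1.0, 0.1],
                  [0.1, 1.0]])
theta = np.array([0.1, 0.8])

s = np.linspace(0.0, 1.0, 50)
X = np.stack(np.meshgrid(s, s), axis=-1).reshape(-1, 2)   # grid of [0, 1]^2
values = shallow_coder(alpha, w, theta, X)                # discretized image
\end{verbatim}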

A standard deep neural network (DNN) with $L$ layers is a function of
$\vec{x} \in \mathbb{R}^{n}$ with parameters $\vec{p}:=\left( \vec{\alpha}_l,
\mathbf{w}_l, \vec{\theta}_l \right)_{l=1}^{L}$,
\begin{align}
    \vec{x}\mapsto\Psi(\vec{x}) := \sum_{j_L=1}^{N_L} \alpha_{j_L,L}\sigma_L
    \left( \rho_{j_L, L} \left( \sum_{j_{L-1}=1}^{N_{L-1}}\cdots
    \left( \sum_{j_1=1}^{N_1}\alpha_{j_1,1}\sigma_1\left(\rho_{j_1,1}(\vec{x})
    \right) \right) \right) \right),
\end{align}
where
\begin{align}
    \rho_{j,l}(\vec{x}) = \mathbf{w}_{j, l}^{T}\vec{x} + \theta_{j,l},
\end{align}
with $\alpha_{j,l}, \theta_{j,l} \in \mathbb{R}$ and $\vec{x},
\mathbf{w}_{j,l} \in \mathbb{R}^{n}$ for all $l=1,\ldots,L$. Such a deep
network is probably not a Lipschitz-differentiable immersion!

The inverse problem
\begin{align}
    N(\vec{p}\;) = F\Psi(\vec{p}\;) = \mathbf{y}
\end{align}
can be solved using variational methods, e.g. Tikhonov regularization
\begin{align}
    \|N(\vec{p}) - \mathbf{y}\|^{2} + \alpha \|\vec{p}\|^{2} \to \min,
\end{align}
or alternatively state space regularization
\begin{align}
    \|N(\vec{p}) - \mathbf{y}\|^{2}
    + \alpha \|\mathbf{x} - \mathbf{x}_0\|^{2}
    \to \min \quad \text{s.t.} \quad \Psi(\vec{p}) = \mathbf{x}.
\end{align}
Here instead an iterative method, the (Gauss-)Newton method, is analyzed,
\begin{align}
    \vec{p}\;^{k+1} = \vec{p}\;^{k} -
    N'( \vec{p}\;^{k})^{-1} (N(\vec{p}\;^{k}) -
    \mathbf{y} ),
\end{align}
where $N'$ is the Jacobian, and we assume that the nonlinear operator $\Psi$
is well-posed.


\section{Newton's method with the neural network operator}
In this section the main results of the talk are explained. The aim is to use
the universal approximation property of neural networks, the fact that they
can approximate continuous functions arbitrarily well, for the inverse
problem \ref{eq: main} solved with the Gauss-Newton method. To ensure
convergence it is necessary to show that the considered neural network
structure is a Lipschitz-differentiable immersion. As will be summarized,
this in particular requires showing that the activation function, its
derivative and the first moment of its derivative are linearly independent.
For this, results from \cite{lamperski_2022} are used and it is conjectured
that the statement \ref{eq: lpdi-property} is fulfilled, meaning that the
functions spanning $\mathbf{X}_{\vec{p}}$ are linearly independent.
\newline
Convergence is based on the immersion property of the network functions:
\begin{align}
    \text{span}\{\partial_{p_i}\Psi(\vec{p})\;:\;i=1,\ldots,n_*\} \qquad
    \text{has rank } n_*.
\end{align}
To show the Newton-Mysovskii conditions for neural network functions the
following notation is adopted:
\begin{align}
    \vec{p} := (\vec{\alpha}, \mathbf{w}, \vec{\theta}) \in
    \mathbb{R}^{N}\times \mathbb{R}^{n\cdot N} \times \mathbb{R}^{N} =
    \mathbb{R}^{n_*}.
\end{align}

For $\alpha_i \neq 0$, this in particular requires that the functions
\begin{align}
    & \frac{\partial \Psi}{\partial \alpha_s} =\sigma(\rho),\quad
    \frac{\partial \Psi}{\partial w_s^{t}} =\sigma'(\rho)x_t,\quad
    \frac{\partial \Psi}{\partial \theta_s} =\sigma'(\rho),\\
    & \text{where} \qquad
    \rho = \sum_{i=1}^{n}w_s^{i}x_i + \theta_s,
\end{align}
assuming that the activation function is continuously differentiable,
are \textbf{linearly independent}, and that $\alpha_s \neq 0$;
\textbf{sparse} coefficients cannot be recovered.
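
Numerically, the columns of the Jacobian $\Psi'(\vec{p}\;)$ are exactly these
partial derivatives sampled on the discretization grid (the actual derivative
carries the factor $\alpha_s$ in the $\mathbf{w}$ and $\vec{\theta}$
columns). A minimal sketch, again my own illustration assuming the tanh
activation and the conventions of the previous code block:
\begin{verbatim}
import numpy as np

def coder_jacobian(alpha, w, theta, X):
    # Columns of Psi'(p) sampled on the grid X, ordered (alpha, w, theta):
    #   d Psi / d alpha_s = sigma(rho_s)
    #   d Psi / d w_s^t   = alpha_s * sigma'(rho_s) * x_t
    #   d Psi / d theta_s = alpha_s * sigma'(rho_s)
    sigma = np.tanh
    dsigma = lambda z: 1.0 - np.tanh(z) ** 2     # derivative of tanh
    rho = X @ w.T + theta                        # rho_s(x) = w_s^T x + theta_s
    d_alpha = sigma(rho)                                       # (m, N)
    d_w = (alpha * dsigma(rho))[:, :, None] * X[:, None, :]    # (m, N, n)
    d_theta = alpha * dsigma(rho)                              # (m, N)
    return np.concatenate(
        [d_alpha, d_w.reshape(X.shape[0], -1), d_theta], axis=1)  # (m, N*(n+2))
\end{verbatim}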
\subsection{Linear independence problem}
The question is whether the functions
\begin{align}
    \frac{\partial \Psi}{\partial \alpha_s} ,
    \frac{\partial \Psi}{\partial w_s^{t}} ,
    \frac{\partial \Psi}{\partial \theta_s} \label{eq: linear_indep}
\end{align}
are linearly independent. A partial answer, for
$\frac{\partial \Psi}{\partial \alpha_s} (\vec{x}) =
\sigma\left( \sum_{i=1}^{n} w_s^{i}x_i + \theta_s \right)$, is given by the
following theorem from \cite{lamperski_2022}:
\begin{theorem}
    \label{thm: lamperski}
    For all activation functions \textit{HardShrink, HardSigmoid, HardTanh,
    HardSwish, LeakyReLU, PReLU, ReLU, ReLU6, RReLU, SoftShrink, Threshold,
    LogSigmoid, Sigmoid, SoftPlus, Tanh and TanhShrink and the PyTorch
    functions CELU, ELU, SELU}, the shallow neural network functions formed
    by \textbf{randomly generated vectors} $(\mathbf{w}, \vec{\theta})$ are
    \textbf{linearly independent}.
\end{theorem}
Here, however, more is needed: the first derivative of the activation
function and the first moment of the first derivative have to be linearly
independent together with the functions covered by the result above. This
answer is not fully satisfactory because such a statement is not known;
loosely speaking, linear independence of the functions above can only be
asserted with probability $1$.

For the sigmoid function there are additional symmetries, because
\begin{align}
    \sigma'(\mathbf{w}^{T}_j \vec{x} + \theta_j)
    = \sigma'\left(-\mathbf{w}_j^{T}\vec{x} - \theta_j \right),
\end{align}
or in another formulation
\begin{align}
    \Psi'(\vec{\alpha}, \mathbf{w}, \vec{\theta}) = \Psi'(\vec{\alpha},
    -\mathbf{w}, -\vec{\theta}).
\end{align}
Conjecture: the obvious symmetries correspond exactly to the ``random set''
excluded in \cite{lamperski_2022}. It is conjectured that the functions in
\ref{eq: linear_indep} are linearly independent, and the results are built
from here on out.

\begin{conjecture}
    \label{conj: main}
    The functions from \ref{eq: linear_indep} are locally linearly
    independent.
\end{conjecture}
From the above conjecture it follows that the shallow network coders are
Lipschitz-differentiable immersions (for a suitable choice of the activation
function), so the Gauss-Newton method converges locally.

\subsection{Gauss-Newton iteration with coding networks}
The immersion property directly implies the local convergence of the
Gauss-Newton method. In this section the proof of the
Lipschitz-differentiable immersion property for the shallow neural network
coders and the convergence of the Gauss-Newton method are summarized.
\begin{lemma}
    \label{lemma: immersion}
    Let the activation function $\sigma$ be one of the functions from Theorem
    \ref{thm: lamperski} \cite{lamperski_2022}. Let $F:\mathbf{X}=L^{2}\left(
    [0, 1]^{n}\right) \to \mathbf{Y}$ be linear, bounded, with trivial
    nullspace and closed range. Assume that Conjecture \ref{conj: main}
    holds. Then
    \begin{itemize}
        \item for all $\vec{p} = (\vec{\alpha}, \mathbf{w}, \vec{\theta}) \in
            \mathbb{R}^{n_*} \subseteq \mathcal{D}(\Psi)$,
            $\mathcal{R}(D\Psi[\vec{p}\;])$ is a linear subspace of
            $\mathbf{X}$ of dimension $n_*$;
        \item there exists an open neighborhood $\mathcal{U} \subseteq
            \mathbb{R}^{n_*}$ of $\vec{p}$ such that $\Psi$ is a
            Lipschitz-differentiable immersion on $\mathcal{U}$.
    \end{itemize}
\end{lemma}
\begin{proof}
    The proof can be found in the appendix, \ref{proof: lem-immersion}.
\end{proof}


The final result states the local convergence of the Gauss-Newton method for
shallow network coders.
\begin{theorem}
    Using the same setup and assumptions as above, additionally let
    $\vec{p}\;^{0} \in \mathcal{D}(\Psi)$ be the starting point of the
    Gauss-Newton iteration \ref{eq: gauss-newton-iteration} and
    $\vec{p}\;^{\dagger} \in \mathcal{D}(\Psi)$ the solution of equation
    \ref{eq: main}. Then the Gauss-Newton method converges quadratically if
    $\vec{p}\;^{0}$ is sufficiently close to $\vec{p}\;^{\dagger}$:
    \begin{align}
        \vec{p}\;^{k+1} = \vec{p}\;^{k} - N'(\vec{p}\;^{k})^{\dagger}\left(
        N\left(\vec{p}\;^{k} \right) - \mathbf{y} \right) \longrightarrow
        \vec{p}\;^{\dagger} \qquad \text{as } k \to \infty.
    \end{align}
\end{theorem}
\begin{proof}
    This follows directly by applying Lemma \ref{lemma: immersion} to Theorem
    \ref{thm: gauss-newton-convergence}.
\end{proof}




\section{Results}
In the following section the results presented in the paper
\cite{scherzer2023newton} are rerun.
\subsection{Numerical results (simplified)}
The simplification is
\begin{align}
    &N = F \circ \Psi, \\
    &\mathbf{y}^{\dagger} = F\Psi(\vec{p}\;^{\dagger}) \qquad \text{is
    attainable.}
\end{align}
The aim is to reconstruct the coefficients of a 2d neural network function
\ref{eq: shallow-nn} with
\begin{align}
    \vec{p}\;^{\dagger} = \left(
    \begin{pmatrix} 1.0\\1.0\\0.1\\0.1 \end{pmatrix};
    \begin{pmatrix} 0.3\\0.1\\1.0\\0.8 \end{pmatrix} \right).
\end{align}
This means the aim is to reconstruct the following function:
\begin{align}
    \Psi^{\dagger}(\vec{x})
    &= \Psi(\vec{p}\;^{\dagger})(\vec{x})\\
    &=1.0\,\sigma\left( (1.0,0.1)^{T}\vec{x} + 0.1 \right)
    +0.3\, \sigma\left( (0.1, 1.0)^{T}\vec{x} + 0.8 \right).
\end{align}
The stopping criterion is $\|\Psi(\vec{p}\;^{k_s+1}) - \Psi^{\dagger}\| \le
\delta = 0.001$.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{./pics/gn_coeff.png}
    \caption{Coefficients, left truth, middle starting point, right
    iteration stop}
    \label{fig: gn_coeff}
\end{figure}
The figure below shows the standard loss used for the stopping criterion and
the residual loss
\begin{align}
    \log \left( \frac{\|\Psi(\vec{p}\;^{k+1}) -
    \Psi^{\dagger}\|}{\|\Psi(\vec{p}\;^{k}) - \Psi^{\dagger}\|} \right).
\end{align}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{./pics/gn_loss.png}
    \caption{Gauss-Newton loss, left standard norm loss, right residual loss}
    \label{fig: gn_loss}
\end{figure}


For comparison with the Gauss-Newton iteration, consider the Landweber
iteration
\begin{align}
    \vec{p}\;^{k+1} = \vec{p}\;^{k} - \lambda \Psi'\left(\vec{p}\;^{k}
    \right)^{\dagger}
    \left( \Psi(\vec{p}\;^{k}) - \Psi^{\dagger} \right) \qquad k \in
    \mathbb{N}_0.
\end{align}
Around $8000$ iterations are needed to reach the same accuracy that the
Gauss-Newton method reaches in $5$, with $\lambda = 0.02$ (e.g. $\lambda=0.2$
diverges).
\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{./pics/lw_coeff.png}
    \caption{Coefficients, left truth, middle starting point, right
    iteration stop}
    \label{fig: lw_coeff}
\end{figure}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{./pics/lw_loss.png}
    \caption{Landweber loss, left standard norm loss, right residual loss}
    \label{fig: lw_loss}
\end{figure}

The observed convergence rate of the Gauss-Newton method can change
completely if the solution is not attainable. The conjecture is that this
non-convergence is caused by multiple solutions, which can be observed when
not enough reconstruction data is used. Also, the implementation of the
simplified Gauss-Newton method requires the inversion of $F$, which is not
done in practice; this is not the case for the Landweber iteration.
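
A rough sketch of how this comparison can be rerun in the simplified setting
(working directly with $\Psi(\vec{p}\;^{k}) - \Psi^{\dagger}$, i.e. with $F$
already inverted). It reuses \texttt{shallow\_coder} and
\texttt{coder\_jacobian} from the sketches above; the starting point, the
grid and the discrete norm are choices made here, so the iteration counts
will not match the figures exactly:
\begin{verbatim}
import numpy as np

# ground truth p^dagger: alpha = (1.0, 0.3), w_1 = (1.0, 0.1),
# w_2 = (0.1, 1.0), theta = (0.1, 0.8)
alpha_t = np.array([1.0, 0.3])
w_t     = np.array([[1.0, 0.1], [0.1, 1.0]])
theta_t = np.array([0.1, 0.8])

s = np.linspace(0.0, 1.0, 50)
X = np.stack(np.meshgrid(s, s), axis=-1).reshape(-1, 2)
psi_target = shallow_coder(alpha_t, w_t, theta_t, X)    # attainable data

def split(p):
    # flattened ordering (alpha, w, theta), matching coder_jacobian
    return p[:2], p[2:6].reshape(2, 2), p[6:]

def iterate(p0, gauss_newton=True, lam=0.02, max_iter=20000, delta=1e-3):
    p = p0.astype(float).copy()
    for k in range(max_iter):
        a, w, t = split(p)
        r = shallow_coder(a, w, t, X) - psi_target      # Psi(p^k) - Psi^dagger
        if np.linalg.norm(r) <= delta:                  # discrete stopping rule
            return p, k
        J = coder_jacobian(a, w, t, X)
        if gauss_newton:
            p = p - np.linalg.lstsq(J, r, rcond=None)[0]   # pseudoinverse step
        else:
            p = p - lam * (np.linalg.pinv(J) @ r)          # Landweber-type step
    return p, max_iter

p0 = np.concatenate([alpha_t, w_t.ravel(), theta_t]) + 0.1  # nearby start
print("Gauss-Newton iterations:", iterate(p0, gauss_newton=True)[1])
print("Landweber iterations:   ", iterate(p0, gauss_newton=False)[1])
\end{verbatim}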
\subsection{Alternative to DNNs}
Instead of using deep neural networks, for which it is unknown whether the
immersion property holds, another option worth considering is to use
quadratic neural network functions, defined as
\begin{align}
    \Psi(\vec{x}) := \sum_{j=1}^{N} \alpha_j\sigma\left(\vec{x}^{T}A_j\vec{x}
    + \mathbf{w}_j^{T}\vec{x} + \theta_j \right),
\end{align}
with $\alpha_j, \theta_j \in \mathbb{R}$, $\mathbf{w}_j \in \mathbb{R}^{n}$
and $A_j \in \mathbb{R}^{n \times n}$. We can also constrain the class of
$A_j$ and $\mathbf{w}_j$, which leads to circular, circular affine,
elliptic, parabolic, \ldots{} networks.
\begin{theorem}
    Quadratic neural network functions satisfy the universal approximation
    property.
\end{theorem}
The immersion property of circular network functions
\begin{align}
    \Psi(\vec{x}) := \sum_{j=1}^{N} \alpha_j\sigma\left(r_j\vec{x}^{T}\vec{x}
    + \mathbf{w}_j^{T}\vec{x} + \theta_j \right)
\end{align}
again requires that
\begin{align}
    \text{span}\{\partial_{p_i}\Psi(\vec{p})\;:\;i=1,\ldots,n_*\} \qquad
    \text{has rank } n_*.
\end{align}
For $\alpha_i \neq 0$, this in particular requires that the functions
\begin{align}
    \frac{\partial \Psi}{\partial r_s} = \sigma'\left( \rho \right)
    \vec{x}^{T}\vec{x}, \quad
    \frac{\partial \Psi}{\partial \alpha_s} = \sigma\left( \rho \right)
    , \quad
    \frac{\partial \Psi}{\partial w_s^{t}} = \sigma'\left( \rho \right)
    x_t,\quad
    \frac{\partial \Psi}{\partial \theta_s} = \sigma'\left( \rho \right)
\end{align}
are \textbf{linearly independent}.
\newline

A result for these types of networks is that the Shepp-Logan phantom can be
represented with \textbf{10 nodes} of elliptic neurons and \textbf{one
layer}, whereas for affine networks, both shallow and deep, infinitely many
neurons are needed.
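
For completeness, a minimal sketch (my own illustration, with the same
conventions as the earlier code blocks) of a circular network function and
the partial derivatives listed above:
\begin{verbatim}
import numpy as np

def circular_coder(alpha, r, w, theta, X):
    # Psi(x) = sum_j alpha_j * sigma(r_j * x^T x + w_j^T x + theta_j)
    sigma = np.tanh
    rho = r * (X ** 2).sum(axis=1, keepdims=True) + X @ w.T + theta  # (m, N)
    return (alpha * sigma(rho)).sum(axis=1)

def circular_jacobian(alpha, r, w, theta, X):
    # columns d/dr_s, d/dalpha_s, d/dw_s^t, d/dtheta_s sampled on the grid X
    sigma = np.tanh
    dsigma = lambda z: 1.0 - np.tanh(z) ** 2
    q = (X ** 2).sum(axis=1, keepdims=True)               # x^T x, shape (m, 1)
    rho = r * q + X @ w.T + theta                         # (m, N)
    d_r     = alpha * dsigma(rho) * q                     # (m, N)
    d_alpha = sigma(rho)                                  # (m, N)
    d_w     = (alpha * dsigma(rho))[:, :, None] * X[:, None, :]  # (m, N, n)
    d_theta = alpha * dsigma(rho)                         # (m, N)
    return np.concatenate(
        [d_r, d_alpha, d_w.reshape(X.shape[0], -1), d_theta], axis=1)
\end{verbatim}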

\appendix
\section{Proofs}
The following section contains the proofs of the theorems and lemmas in the
main text.
\begin{myproof}{Theorem \ref{thm: moore-penrose}}
    \label{proof: thm-moore-penrose}
    \begin{enumerate}
        \item To show that $\Psi'(\vec{p}\;)^{\dagger}$ is indeed the
            Moore-Penrose inverse we need to verify the four identities given
            in \ref{eq: moore-penrose}. Aligning the notation with the
            definition of the Moore-Penrose inverse,
            \begin{align}
                &L = \Psi'(\vec{p}\;) : \vec{P}=\mathbb{R}^{n_*} \to
                \mathbf{X}, \\
                & B = \Psi'(\vec{p}\;)^{\dagger}: \mathbf{X} \to \vec{P}, \\
                &\mathcal{D}(B) = \mathcal{D}(\Psi'(\vec{p}\;)^{\dagger}) =
                \mathbf{X}, \\
                &P: \vec{P} \to \vec{P} \qquad \text{the zero operator},\\
                &Q = P_{\vec{p}}:\mathbf{X} \to \mathbf{X}_{\vec{p}}
                \qquad \text{the projection operator onto }
                \mathbf{X}_{\vec{p}}.
            \end{align}
            For the third identity with $P = 0$, let $\vec{q} =
            (q_i)_{i=1}^{n_*} \in \vec{P}$; then
            \begin{align}
                \Psi'(\vec{p}\;)^{\dagger}\Psi'(\vec{p}\;)\vec{q} &=
                \Psi'(\vec{p}\;)^{\dagger}\left( \sum_{i=1}^{n_*} q_i
                \partial_{p_i} \Psi(\vec{p}\;) \right) \\
                &=\left(q_i \right)_{i=1}^{n_*} \\
                &= \vec{q}.
            \end{align}
            For the fourth identity we use the fact that $(\partial_{p_i}
            \Psi(\vec{p}\;))_{i=1}^{n_*}$ is a basis of
            $\mathbf{X}_{\vec{p}}$, so every $\mathbf{x} \in \mathbf{X}$ can
            be represented with coefficients $x_i$, $i \in \{1,\ldots,n_*\}$,
            as
            \begin{align}
                \mathbf{x} = \sum_{i=1}^{n_*}
                x_i\partial_{p_i}\Psi(\vec{p}\;) + \mathbf{x}_2,
            \end{align}
            with $\mathbf{x}_2 \in \mathbf{X}^{\perp}_{\vec{p}}$. This means
            that
            \begin{align}
                P_{\vec{p}}\;\mathbf{x} = \sum_{i=1}^{n_*} x_i\partial_{p_i}
                \Psi(\vec{p}\;) \qquad \text{and}\\
                \Psi'(\vec{p}\;)^{\dagger}\mathbf{x} =
                (x_i)_{i=1}^{n_*}=\vec{x}.
            \end{align}
            The identity then reduces to
            \begin{align}
                \Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}\mathbf{x} =
                \Psi'(\vec{p}\;)\vec{x} = P_{\vec{p}}\;\mathbf{x}.
            \end{align}
            For the second identity, use the results from the third and the
            fourth identity and the fact that for all $\mathbf{x} \in
            \mathbf{X}$:
            \begin{align}
                \Psi'(\vec{p}\;)^{\dagger}\Psi'(\vec{p}\;)
                \Psi'(\vec{p}\;)^{\dagger}\mathbf{x}
                &= \Psi'(\vec{p}\;)^{\dagger}
                \Psi'(\vec{p}\;)\vec{x} \\
                &= \vec{x}\\
                &= \Psi'(\vec{p}\;)^{\dagger} \mathbf{x}.
            \end{align}
            For the first identity, using the previous results, for all
            $\vec{q} \in \vec{P}$:
            \begin{align}
                \Psi'(\vec{p}\;)
                \Psi'(\vec{p}\;)^{\dagger}
                \Psi'(\vec{p}\;)\vec{q}
                &= \Psi'(\vec{p}\;) \vec{q},\\
                \text{i.e. } LBL &= L.
            \end{align}

        \item The boundedness and Lipschitz continuity statements are not
            repeated here; see \cite{scherzer2023newton}.
    \end{enumerate}
\end{myproof}

\begin{myproof}{Lemma \ref{lem: moore-penrose}}
    \label{proof: lem-moore-penrose}
    \begin{enumerate}
        \item
            First of all, by the chain rule it holds that
            \begin{align}
                N'(\vec{p}\;) = F\Psi'(\vec{p}\;) \qquad
                \forall \vec{p} \in \mathcal{D}(\Psi) = \mathcal{D}(N).
            \end{align}
            To prove the decomposition property of the Moore-Penrose inverse
            it is necessary to verify the defining equations
            \ref{eq: moore-penrose}. Use the notation
            \begin{align}
                &L:= N'(\vec{p}\;) = F\Psi'(\vec{p}\;): \vec{P} \to
                \mathbf{Y}, \\
                &B:= \Psi'(\vec{p}\;)^{\dagger}F^{-1} : \mathcal{R}(F)
                \subseteq \mathbf{Y} \to \vec{P}.
            \end{align}
            Also note that by assumption $F$ has dense range, and consider
            $B$ on $\mathcal{R}(F) \dotplus
            \underbrace{\mathcal{R}(F)^{\perp}}_{=\{0\}}$. Thereby, for a
            fixed $\vec{p}$, the notation of \ref{eq: moore-penrose} reads
            \begin{align}
                &\mathcal{D}(B) =
                \mathcal{D}(\Psi'(\vec{p}\;)^{\dagger}F^{-1}) =
                \mathcal{R}(F),\\
                &\mathcal{R}(L) =
                \big\{F\Psi'(\vec{p}\;)\vec{q}\;:\;\vec{q}
                \in \mathbb{R}^{n_*} \big\} = \mathcal{R}(FP_{\vec{p}}).
            \end{align}
            Note that $Q$ is the orthogonal projection onto
            $\overline{\mathcal{R}(L)}$, i.e.
            $\overline{\mathcal{R}(FP_{\vec{p}})}$; then for
            \begin{align}
                \mathbf{z} &= F\mathbf{x} \\
                &= FP_{\vec{p}}\;
                \mathbf{x} + F\left(I-P_{\vec{p}} \right) \mathbf{x}
            \end{align}
            we have that
            \begin{align}
                Q\mathbf{z} &= Q\left(FP_{\vec{p}}\;\mathbf{x}
                + F(I-P_{\vec{p}})\mathbf{x}\right) \\
                &= FP_{\vec{p}}\; \mathbf{x}.
            \end{align}
            Finally, applying Theorem \ref{thm: moore-penrose} and the
            invertibility of $F$ on the range of $F$ shows that
            \begin{align}
                LBL &= F\Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}
                F^{-1}F\Psi'(\vec{p}\;) \\
                &=
                F\Psi'(\vec{p}\;)
                \Psi'(\vec{p}\;)^{\dagger}\Psi'(\vec{p}\;) \nonumber\\
                &= F \Psi'(\vec{p}\;) \nonumber \\
                &= L, \nonumber \\
                \nonumber \\
                BLB &= \Psi'(\vec{p}\;)^{\dagger}F^{-1}F\Psi'(\vec{p}\;)
                \Psi'(\vec{p}\;)^{\dagger}F^{-1} \\
                &=
                \Psi'(\vec{p}\;)^{\dagger}
                \Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}F^{-1} \nonumber\\
                &= \Psi'(\vec{p}\;)^{\dagger}F^{-1} \nonumber \\
                &= B, \nonumber \\
                \nonumber \\
                BL &= \Psi'(\vec{p}\;)^{\dagger}F^{-1}F\Psi'(\vec{p}\;) \\
                &= I - P \\
                &= I \quad \text{on } \mathbb{R}^{n_*}.
            \end{align}
            The fourth identity is proven by taking $\mathbf{z} \in
            \mathcal{R}(F)$, so there exists an $\mathbf{x} \in \mathbf{X}$
            such that $F\mathbf{x} = \mathbf{z}$; then it follows that
            \begin{align}
                LB\mathbf{z} &=
                F\Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}
                F^{-1}\mathbf{z} \\
                &= F\Psi'(\vec{p}\;)\Psi'(\vec{p}\;)^{\dagger}
                \mathbf{x} \\
                &= FP_{\vec{p}}\mathbf{x} \\
                &= Q\mathbf{z}.
            \end{align}
        \item As for the generalized Newton-Mysovskii conditions, the
            operator $\Psi'(\vec{p}\;)$ is locally bounded and locally
            Lipschitz continuous on $\mathcal{D}(\Psi)$, meaning that there
            are constants $C_L, C_l > 0$ such that for all $\vec{p}, \vec{q}
            \in \mathcal{D}(\Psi)$ the following inequalities hold:
            \begin{align}
                &\big\|\Psi'(\vec{p}\;) - \Psi'(\vec{q}\;)
                \big\|_{\vec{P}\to\mathbf{X}} \leq C_L \|\vec{p} - \vec{q} \|,
                \\
                &\big\|\Psi'(\vec{p}\;) \big\|_{\vec{P} \to \mathbf{X}} \le
                C_l.
            \end{align}
            Thus, using also the uniform bound
            $\|\Psi'(\vec{p}\;)^{\dagger}\|_{\mathbf{X}\to\vec{P}} \le C_l$
            from Theorem \ref{thm: moore-penrose},
            \begin{align}
                &\Big\| N'(\vec{p}\;)^{\dagger}\left( N'(\vec{q} + s(\vec{p} -
                \vec{q}\;)) - N'(\vec{q}\;)\right)
                (\vec{p} - \vec{q}) \Big\|_{\vec{P}} \\
                &= \Big\|
                \Psi'(\vec{p}\;)^{\dagger}F^{-1}\left( F\Psi'(\vec{q}\;
                + s\left( \vec{p} - \vec{q}\; \right))-F\Psi'(\vec{q}\;)
                \right) \left(\vec{p} - \vec{q} \right) \Big\|_{\vec{P}}
                \nonumber \\
                &= \Big\|
                \Psi'(\vec{p}\;)^{\dagger}\left(\Psi'(\vec{q}\;
                + s\left( \vec{p} - \vec{q}\; \right))-\Psi'(\vec{q}\;)
                \right) \left(\vec{p} - \vec{q} \right) \Big\|_{\vec{P}}
                \nonumber\\
                &\leq C_lC_L s \|\vec{p} - \vec{q} \|^{2}_{\vec{P}},
                \nonumber
            \end{align}
            for all $\vec{p}, \vec{q} \in \mathcal{D}(\Psi) = \mathcal{D}(N)$.

    \end{enumerate}

\end{myproof}

\begin{myproof}{Theorem \ref{thm: gauss-newton-convergence}}
    \label{proof: thm-gauss-newton-convergence}
    The distance between the solution and the first iterate is $\rho =
    \|\vec{p}\;^{\dagger} - \vec{p}\;^{0}\|$. Also $\mathcal{D}(\Psi) =
    \mathcal{D}(N)$, since $F$ is defined on all of $\mathbf{X}$. First we
    have to prove that all iterates lie in the closed ball, i.e.
    $\vec{p}\;^{k} \in \overline{\mathcal{B}(\vec{p}\;^{\dagger}; \rho)}$;
    this is done by induction. The case $k=0$ is fulfilled by the assumption
    on the starting point. For $\vec{p}\;^{k}$, use the equations
    \ref{eq: moore-penrose}:
    \begin{align}
        &N'(\vec{p}\;^{k})N'(\vec{p}\;^{k})^{\dagger}N'(\vec{p}\;^{k})
        \left(\vec{p}\;^{k+1} - \vec{p}\;^{\dagger} \right)\\
        &= N'(\vec{p}\;^{k})(\vec{p}\;^{k+1} - \vec{p}\;^{\dagger}).
    \end{align}
    Then the definition of the Gauss-Newton method
    \ref{eq: gauss-newton-iteration} implies
    \begin{align}
        &N'(\vec{p}\;^{k})(\vec{p}\;^{k+1}-\vec{p}\;^{\dagger}) =\\
        &=N'(\vec{p}\;^{k})N'(\vec{p}\;^{k})^{\dagger}\left(
        N(\vec{p}\;^{\dagger})-N(\vec{p}\;^{k})
        - N'(\vec{p}\;^{k})(\vec{p}\;^{\dagger} - \vec{p}\;^{k})\right).
    \end{align}
    Now using the third identity in \ref{eq: moore-penrose}, where by the
    assumptions of this theorem $P = 0$, and using the second identity in
    \ref{eq: moore-penrose} together with the fact that $F$ is injective, it
    follows that
    \begin{align}
        \vec{p}\;^{k+1} - \vec{p}\;^{\dagger}
        &=
        N'(\vec{p}\;^{k})^{\dagger}N'(\vec{p}\;^{k})(\vec{p}\;^{k+1} -
        \vec{p}\;^{\dagger}) \\
        &=
        N'(\vec{p}\;^{k})^{\dagger}\left(N(\vec{p}\;^{\dagger})-N(\vec{p}\;^{k})
        - N'(\vec{p}\;^{k})(\vec{p}\;^{\dagger} - \vec{p}\;^{k}) \right) \\
        &= \Psi'(\vec{p}\;^{k})^{\dagger}\left(\Psi(\vec{p}\;^{\dagger})-
        \Psi(\vec{p}\;^{k}) -
        \Psi'(\vec{p}\;^{k})(\vec{p}\;^{\dagger}-\vec{p}\;^{k}) \right).
    \end{align}
    Finally, the Newton-Mysovskii conditions give us
    \begin{align}
        \|\vec{p}\;^{k+1} - \vec{p}\;^{\dagger}\|
        &\le \frac{C_lC_L}{2}
        \|\vec{p}\;^{k} - \vec{p}\;^{\dagger}\|^{2}_{\vec{P}} \\
        &\le \frac{C_lC_L\rho}{2}
        \|\vec{p}\;^{k}-\vec{p}\;^{\dagger}\|_{\vec{P}} \\
        &< \|\vec{p}\;^{k} - \vec{p}\;^{\dagger}\|_{\vec{P}},
    \end{align}
    meaning that $\vec{p}\;^{k+1} \in \mathcal{B}(\vec{p}\;^{\dagger};
    \rho)$, and thereby the Gauss-Newton method is well defined. Also
    \begin{align}
        \|\vec{p}\;^{k+1} - \vec{p}\;^{\dagger}\|
        &\leq \left( \frac{C_lC_L\rho}{2} \right)^{k+1}
        \|\vec{p}\;^{0} - \vec{p}\;^{\dagger}\| \\
        &\le \left( \frac{C_lC_L\rho}{2} \right)^{k+1} \rho,
    \end{align}
    where $\frac{C_lC_L\rho}{2} < 1$, meaning that $\|\vec{p}\;^{k} -
    \vec{p}\;^{\dagger}\|$ converges to $0$ as $k\to \infty$, and the
    convergence is quadratic. \qed
\end{myproof}

\begin{myproof}{Lemma \ref{lemma: immersion}}
    \label{proof: lem-immersion}
    \begin{itemize}
        \item Because of the differentiability assumptions on $\sigma$, for
            every fixed $\vec{p}$ the partial derivatives
            $\partial_{p_i}\Psi(\vec{p}\;)$ belong to
            $L^{2}\left([0,1]^{n}\right)$. So Conjecture \ref{conj: main}
            directly implies that $\mathcal{R}(D\Psi[\vec{p}\;])$ is a linear
            subspace of $\mathbf{X}$ of dimension $n_*$.
        \item Note that the operator $D^{2}\Psi[\vec{p}\;]: \mathbb{R}^{n_*}
            \to L^{2}([0, 1]^{n})$ is continuous (assuming that the
            activation function $\sigma$ is twice differentiable).
            Considering a nonempty neighborhood $\mathcal{U}$ of $\vec{p}$
            with compact closure, the continuity of $D^{2}\Psi$ w.r.t.\
            $\vec{p}$ gives that $D\Psi$ is Fr\'echet-differentiable with
            Lipschitz-continuous derivative on $\mathcal{U}$. This means
            there exist constants $C_l, C_L > 0$ such that for all $\vec{q},
            \vec{p} \in \mathcal{U}$ it holds that
            \begin{align}
                &\big\|\Psi'(\vec{p}\;) - \Psi'(\vec{q}\;)
                \big\|_{\vec{P}\to\mathbf{X}} \leq C_L \|\vec{p} - \vec{q} \|,
                \\
                &\big\|\Psi'(\vec{p}\;) \big\|_{\vec{P} \to \mathbf{X}} \le
                C_l.
            \end{align}
            \qed
    \end{itemize}

\end{myproof}




\nocite{kaltenbacher2008}
\nocite{frischauf2022universal}
\printbibliography

\end{document}