1) $F( x) =\frac{1}{x}\int_{0}^{x}\!f(t) \ dt \ $ where $f\!:\mathbb{R}\rightarrow{}\mathbb{R}$ is convex.
$\begin{array}{l} \text{if we set} \ t\ =\ xy\text{, and } x=\theta u +( 1-\theta ) v\ \text{ then,}\\ \ F( \theta u+( 1-\theta ) v) =\int _{0}^{1} f(( \theta u+( 1-\theta ) v) y) \ dy\\ \underset{\text{Jensen's}}{\leqslant } \\ \int _{0}^{1}( \theta f( uy) +( 1-\theta ) f( vy)) \ dy\ =\ \theta F( u) +( 1-\theta ) F( v)\\ F( \theta u+( 1-\theta ) v) \ \leqslant \ \theta F( u) +( 1-\theta ) F( v) \ \therefore \ F \ \text{is convex by definition.} \end{array}$
2) $\displaystyle f_{\theta }( x) =\theta ^{-1} x^{\theta } -\theta ^{-1}$ with $\displaystyle \text{dom} \ f_{\theta } =\mathbb{R}_{+}$ and $\displaystyle 0< \theta \leqslant 1$.
$\begin{array}{l} f_{\theta }^{''}( x) =( \theta -1) x^{\theta -2}\text{, since} \ f_{\theta }^{''}( x) \leqslant 0\text{, } f_{\theta } \ \text{is not convex.} \end{array}$
3) $\displaystyle f( x_{1} ,x_{2}) =\frac{x_{1}^{2}}{x_{2}}$ and $\displaystyle x_{2} >0$
$\begin{array}{l} \text{for } \ \mathbf{H}_{f} =\begin{bmatrix} \frac{2}{x_{2}} & -\frac{2x_{1}}{x_{2}^{2}}\\ -\frac{2x_{1}}{x_{2}^{2}} & \frac{2x_{1}^{2}}{x_{2}^{3}} \end{bmatrix}\text{, since} \ \frac{2}{x_{2}} >0\text{, and }\frac{2x_{1}^{2}}{x_{2}^{3}} \geqslant 0\ \text{ for } x_{2} >0\\ \text{the Hessian matrix of f is positive semidefinite.} \therefore \ f\ \text{ is convex.} \end{array}$
4) $\displaystyle f( x_{1} ,x_{2} ,x_{3}) =-e^{-x_{1}} +x_{2} x_{3}^{2}$
$\begin{array}{l} \text{for } \ \mathbf{H}_{f} =\begin{bmatrix} -e^{-x_{1}} & 0 & 0\\ 0 & 0 & 2x_{3}\\ 0 & 2x_{3} & 2x_{2} \end{bmatrix}\text{, and } g^{T}\mathbf{H} g=6x_{2} x_{3}^{2} -e^{-x_{1}} x_{1}^{2}\\ \text{the Hessian matrix of f is not positive semidefinite. } \ \therefore \ f\ \text{ is not convex.} \end{array}$
Likelihood function $F$ where $c$ and $k$ are positive numbers, and $\displaystyle y_{i}$ can be found in "Topic2.csv" is defined as: \begin{align*} L( k,c;\ y_{1} ,...,y_{n}) &=\prod _{i=1}^{n}\frac{k\ c\ y_{i}^{c-1}}{\left( 1+y_{i}^{c}\right)^{k+1}} \\ &=( k\ c)^{n}\prod _{i=1}^{n}\frac{y_{i}^{c-1}}{\left( 1+y_{i}^{c}\right)^{k+1}} \end{align*}
a) log-likelihood:
\begin{array}{l} \log( L( k,c;\ y_{i})) \ =\ \log\left(( k\ c)^{n}\right) +\sum _{i=1}^{n}\frac{\log\left( y_{i}^{c-1}\right)}{\log\left(\left( 1+y_{i}^{c}\right)^{k+1}\right)}\\ \ell ( k,c;\ y_{i}) =\ n\log( k\ c) +( c-1)\sum _{i=1}^{n}\log\left( y_{i}\right) -( k+1)\sum _{i=1}^{n}\log\left( 1+y_{i}^{c}\right) \end{array}b) Maximum Likelihood Estimate:
\begin{equation*} \text{for} \ \theta =\{k,c\}\text{, }\hat{\theta }_{MLE} =\underset{\theta }{\arg\max} L( y_{i} |\theta ) =\underset{\theta }{\arg\max} \ell ( y_{i} |\theta ) \end{equation*}c) Gradient:
\begin{array}{l} \frac{\partial }{\partial k} \ell ( k,c;\ y_{i}) =\frac{n}{k} -\sum _{i=1}^{n}\log\left( 1+y_{i}^{c}\right)\\ \begin{align*} \frac{\partial }{\partial c} \ell ( k,c;\ y_{i}) &= \frac{n}{c} +\sum _{i=1}^{n}\log\left( y_{i}\right) -( k+1)\sum _{i=1}^{n}\frac{y_{i}^{c}\log( y_{i})}{1+y_{i}^{c}} \\ &= \frac{n}{c} +\sum _{i=1}^{n}\frac{\left( 1-k\ y_{i}^{c}\right)\log( y_{i})}{y_{i}^{c} +1} \end{align*} \end{array}Hessian:
\begin{equation*} \mathbf{H}_{\ell _{\{k,c\}}} =\begin{bmatrix} \frac{\partial ^{2} \ell }{\partial k^{2}} & \frac{\partial ^{2} \ell }{\partial k\partial c}\\ \frac{\partial ^{2} \ell }{\partial c\partial k} & \frac{\partial ^{2} \ell }{\partial c^{2}} \end{bmatrix} =\begin{bmatrix} -\frac{n}{k^{2}} & -\sum _{i=1}^{n}\frac{y_{i}^{c}\log( y_{i})}{y_{i}^{c} +1}\\ -\sum _{i=1}^{n}\frac{y_{i}^{c}\log( y_{i})}{y_{i}^{c} +1} & -\frac{n}{c^{2}} -( k+1)\sum _{i=1}^{n}\frac{y_{i}^{c}\log^{2}( y_{i})}{\left( y_{i}^{c} +1\right)^{2}} \end{bmatrix} \end{equation*}import cmath as cm
import numpy as np
import matplotlib.pyplot as plt
y = np.loadtxt('Topic2.csv', delimiter=',')
def ell(kc, y):
k, c, n = kc[0], kc[1], y.shape[0]
return n * np.real(cm.log(k * c)) + (c - 1) * np.sum(np.log(y)) - (k + 1) * np.sum(np.log(1 + y ** c))
def grad_kc(kc, y):
k, c, n = kc[0], kc[1], y.shape[0]
grad_k = (n / k) - np.sum(np.log(1 + y ** c))
grad_c = (n / c) + np.sum(((1 - k * y ** c) * np.log(y)) / (y ** c + 1))
return np.array([grad_k, grad_c])
def hess_kc(kc, y):
k, c, n = kc[0], kc[1], y.shape[0]
k_gg = -(n/k**2)
c_gg = -(n/c**2)-(k+1)*(np.sum((y**c)*(np.log(y**c)**2)/(y**c+1)**2))
kc_gg = -np.sum((y**c)*np.log(y**c)/(y**c+1))
return np.array([[k_gg, kc_gg], [kc_gg, c_gg]])
def plot(kc_list, conv_list, f_list):
kc_mle = np.round(kc_list[-1], 3).tolist()
print('Results converged in', f_list.size, 'iterations.',
'\nk^ MLE:', kc_mle[0], '\nc^ MLE: ', kc_mle[1],
'\nlog-likelihood:', np.round(f_list[-1],2))
fig, ax = plt.subplots(2, 2, figsize=(10, 10))
fig.suptitle('Plots of Params vs Number of Iterations', fontsize=16)
ax[0, 0].plot(kc_list[:, 0]), ax[0, 0].set_title(r'$\hat{k}$ estimates')
ax[0, 1].plot(kc_list[:, 1]), ax[0, 1].set_title(r'$\hat{c}$ estimates')
ax[1, 0].plot(f_list), ax[1, 0].set_title(r'$\ell(c,k;y_i)$')
ax[1, 1].plot(conv_list), ax[1, 1].set_title('Convergence')
plt.tight_layout()
plt.show()
def gDwBt(y, alpha=1, beta=0.1, max_iter=500, tol=1e-5, rs=0):
np.random.seed(rs)
kc0 = np.random.randn(2)
ell0 = ell(kc0, y)
grad_ = grad_kc(kc0, y)
conv_l = [grad_ @ grad_]
kc_l = [kc0]
f_l = [ell0]
for i in range(max_iter):
kc = kc0 + alpha * grad_kc(kc0, y)
ell_new = ell(kc, y)
while ell_new < ell0:
alpha = beta * alpha
kc = kc0 + alpha * grad_
ell_new = ell(kc, y)
kc0 = kc + alpha * grad_
ell0 = ell(kc0, y)
grad_ = grad_kc(kc0, y)
conv_l.append(grad_ @ grad_)
kc_l.append(kc0)
f_l.append(ell0)
if np.abs(conv_l[-1]) < tol:
break
return np.array(kc_l), np.array(conv_l), np.array(f_l)
kc_list, conv_list, f_list = gDwBt(y)
plot(kc_list, conv_list, f_list)
Results converged in 401 iterations. k^ MLE: 8.948 c^ MLE: 3.556 log-likelihood: 795.46
def agDwBt(y, alpha=0.001, beta=0.01, max_iter=500, tol=1e-5, rs=0):
np.random.seed(rs)
kc0 = np.random.randn(2)
kc_2 = np.random.randn(2)
ell0 = ell(kc0, y)
kc = kc0
grad_ = grad_kc(kc0, y)
conv_l = [grad_ @ grad_]
kc_l = [kc0]
f_l = [ell0]
k=1
for i in range(max_iter):
ell_new = ell(kc, y)
while ell_new < ell0:
alpha = alpha/beta
kc_ = kc_2 + alpha * grad_
ell_new = ell(kc_, y)
kc0 = kc_2 + alpha * grad_
kc_2 = kc0 + (k-1)/(k+2) * (kc0 - kc)
ell0 = ell(kc0, y)
grad_ = grad_kc(kc_2, y)
kc = kc0
k+=1
conv_l.append(grad_ @ grad_)
kc_l.append(kc0)
f_l.append(ell0)
if np.abs(conv_l[-1]) < tol:
break
return np.array(kc_l), np.array(conv_l), np.array(f_l)
kc_list, conv_list, f_list = agDwBt(y)
plot(kc_list, conv_list, f_list)
Results converged in 151 iterations. k^ MLE: 8.945 c^ MLE: 3.556 log-likelihood: 795.46
(also includes convergence mean change test)
def sgDwBt(y, alpha=1, beta=0.2, bsize=0.2, max_iter=500, tol=1e-4, rs=0):
np.random.seed(rs)
cap = int(bsize * len(y))
kc0 = np.random.randn(2)
y_ = np.random.choice(y, cap, replace=False)
ell0 = ell(kc0, y_)
grad_ = grad_kc(kc0, y_)
conv_l = [grad_ @ grad_]
kc_l = [kc0]
f_l = [ell0]
for i in range(max_iter):
kc = kc0 + alpha * grad_kc(kc0, y_)
ell_new = ell(kc, y_)
while ell_new < ell0 and alpha > 0.009:
alpha = alpha * beta
kc = kc0 + alpha * grad_
ell_new = ell(kc, y_)+alpha
kc0 = kc + alpha * grad_
ell0 = ell(kc0, y_)
grad_ = grad_kc(kc0, y_)
y_ = np.random.choice(y, cap, replace=False)
conv_l.append(grad_ @ grad_)
kc_l.append(kc0)
f_l.append(ell0)
if len(conv_l) > (max_iter/10 + 15):
changecontrol = np.abs(np.mean(conv_l[-11:-1]) - np.mean(conv_l[-16:-5]))/1e3
if changecontrol < tol:
break
kc_l.append(np.mean(kc_list[-5:-1], axis=0))
f_l.append(np.mean(f_l[-5:-1]))
conv_l.append(np.mean(conv_l[-11:-1]))
return np.array(kc_l), np.array(conv_l), np.array(f_l)*(bsize*100)
kc_list, conv_list, f_list = sgDwBt(y, alpha=0.1, beta=0.5, bsize=0.1)
plot(kc_list, conv_list, f_list)
Results converged in 219 iterations. k^ MLE: 8.938 c^ MLE: 3.555 log-likelihood: 809.55
# newton's method with line search
def newton(y, beta=0.9, max_iter=500, tol=1e-6):
kc0 = np.array([0.5,0.5])
ell0 = ell(kc0, y)
grad_ = grad_kc(kc0, y)
hess_inv = np.linalg.inv(hess_kc(kc0, y))
conv_l = [grad_ @ grad_]
kc_l = [kc0]
f_l = [ell0]
alph_ = 1
for i in range(max_iter):
kc = kc0 - alph_ * hess_inv @ grad_
ell_new = ell(kc, y)
while ell_new < ell0:
alph_ = beta * alph_
kc = kc0 - alph_ * hess_inv @ grad_
ell_new = ell(kc, y)
kc0 = kc - alph_ * hess_inv @ grad_
ell0 = ell(kc0, y)
grad_ = grad_kc(kc0, y)
hess_inv = np.linalg.inv(hess_kc(kc0, y))
conv_l.append(grad_ @ grad_)
kc_l.append(kc0)
f_l.append(ell0)
if np.log1p(np.abs(conv_l[-1]-conv_l[-2])) < tol:
break
return np.array(kc_l), np.array(conv_l), np.array(f_l)
kc_list, conv_list, f_list = newton(y)
plot(kc_list, conv_list, f_list)
Results converged in 197 iterations. k^ MLE: 8.948 c^ MLE: 3.557 log-likelihood: 795.46
Minimizing the following loss function using Gauss-Newton method:
\begin{equation*} L( \beta ) =\underset{\beta _{1} ,\beta _{2}}{min}\sum _{i=1}^{n}\left( y_{i} -\frac{y_{1} \beta _{1}}{y_{1} +( \beta _{1} -y_{1})\exp( -\beta _{2} d_{i})}\right)^{2} \end{equation*}Topic3.csv
contains $d_i$ (first column) and $y_i$ (second column). Our goal is to estimate $\displaystyle \hat{\beta } =\begin{bmatrix}
\hat{\beta }_{1} & \hat{\beta }_{2}
\end{bmatrix}^{T}$.
To decompose $\displaystyle L( \beta )$ as $\displaystyle L( \beta ) =g^{T}( \beta ) g( \beta )$, we set $\displaystyle g( \beta ) =\left[\overset{\rightharpoonup }{y} -\frac{y_{1} \beta _{1}}{y_{1} +( \beta _{1} -y_{1})\exp\left( -\beta _{2}\overset{\rightharpoonup }{d}\right)}\right]$.
The Jacobian matrix $\displaystyle \mathbf{J}_{g}( \beta _{1} ,\beta _{2}) =\begin{bmatrix} \frac{\partial g_{1...n}( \beta )}{\partial \beta _{1}} & \frac{\partial g_{1...n}( \beta )}{\partial \beta _{2}} \end{bmatrix}$ is derived as:
\begin{align*} \mathbf{J}_{g} (\beta _{1} ,\beta _{2} ) & =\begin{bmatrix} -y_{1}\frac{\left( y_{1} +(\beta _{1} -y_{1} )e^{-\beta _{2} d}\right) -e^{-\beta _{2} d} \beta _{1}}{\left( y_{1} +(\beta _{1} -y)e^{-\beta _{2} d}\right)^{2}} & y_{1} \beta _{1}\frac{-de^{-d\beta _{2}} (\beta _{1} -y_{1} )}{\left( y_{1} +(\beta _{1} -y_{1} )e^{-\beta _{2} d}\right)^{2}} \end{bmatrix}\\ & =\begin{bmatrix} \frac{y_{1}^{2}\left( 1-e^{\beta _{2} d}\right) e^{\beta _{2} d}}{\left( \beta _{1} +y_{1} e^{\beta _{2} d} -y_{1}\right)^{2}} & \frac{y_{1} \beta _{1} d(y_{1} -\beta _{1} )e^{\beta _{2} d}}{\left( \beta _{1} +y_{1} e^{\beta _{2} d} -y_{1}\right)^{2}} \end{bmatrix} \end{align*}import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 100
dat = np.loadtxt('Topic3.csv', delimiter=',')
d, y = dat[:,0], dat[:,1]
def f(beta):
b1, b2, y1 = beta[0], beta[1], y[0]
return np.sum((y-(y1*b1)/(y1+(b1-y1)*np.exp(-b2*d)))**2)
def g(beta):
b1, b2, y1 = beta[0], beta[1], y[0]
return np.array([y-(y1*b1)/(y1+(b1-y1)*np.exp(-b2*d))]).T
def J(beta):
b1, b2, y1 = beta[0], beta[1], y[0]
J1 = np.exp(b2*d)*(1-np.exp(b2*d))*y1**2/(b1+y1*np.exp(b2*d)-y1)**2
J2 = np.exp(b2*d)*y1*b1*d*(y1-b1)/(b1+y1*np.exp(b2*d)-y1)**2
return np.c_[J1, J2]
beta0 = np.array([1, 1])
alpha, tol, max_iter=1, 1e-9, 500
g0 = g(beta0)
J0 = J(beta0)
f0 = f(beta0)
f_list = [f0]
Omega = -np.linalg.solve(J0.T@J0 +10*np.eye(2), J0.T@g0).T
while np.abs(Omega).max() > tol and len(f_list) < max_iter:
beta_ = beta0 + alpha*Omega
f_ = f(beta_.T)
while f_ > f0:
alpha = alpha*0.1
beta_ = beta0 - alpha*Omega
f_ = f(beta_.T)
beta0 = beta_
g0 = g(beta0.T)
J0 = J(beta0.T)
f0 = f_
f_list.append(f0)
Omega = -np.linalg.solve(J0.T@J0 +10*np.eye(2), J0.T@g0).T
y_pred = (y[0]*beta_.T[0])/(y[0]+(beta_.T[0]-y[0])*np.exp(-beta_.T[1]*d))
plt.figure(figsize=(8,6))
cap = r'GN converged in %s iters for tol = 1e-9 estimating ' %(len(f_list)-1)
beta_str = r'$\hat{\beta}_1$ = %2.3f and $\hat{\beta}_2$ = %2.3f'\
%(beta_.T[0],beta_.T[1])
plt.plot(d, y_pred, label=r'Predicted $y$'), plt.plot(d, y, label=r'Actual $y$')
plt.suptitle(cap + beta_str, size=11)
plt.title(r'Plot of $y$ and $y_{1}\beta_{1}/(y_{1}+'
r'(\beta_{1}-y_{1})\exp(-\beta_{2}d_{i}))$', size=14)
_= plt.legend(), plt.show()
Considering the following optimization problem:
\begin{equation*} \underset{x}{\min}\frac{1}{2} x^{T} Px+q^{T} x+\frac{\lambda }{2}\Vert z\Vert _{2}^{2} \ \quad \ s.t.\quad \ \begin{cases} x=z\\ a\leqslant z\leqslant b \end{cases} \ \quad \text{where} \ P\in \mathbb{R}^{n\times n} \ \text{and} \ a,b,x,q\in \mathbb{R}^{n} \end{equation*}For the scaled form of the ADMM, $\mathcal{L}( x,z,w;\rho ) =f( x) +g( z) +\frac{\rho }{2}\Vert Ax+Bz-c+w\Vert _{2}^{2} +\frac{\rho }{2}\Vert w\Vert _{2}^{2}$ with $w=\frac{\mathcal{u}}{\rho}$ we have:
Now we can derive the gradients to get rid of $\arg\underset{\{x,\ z\}}{\min}$ in the update equations:
Implementation:
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
# load data
data_mat = sio.loadmat('Topic4.mat')
P, q, a, b = data_mat['P'], data_mat['q'], data_mat['a'].T, data_mat['b'].T
lam, rho = 0.5, 1.1
max_iter = 50
tol = 1e-9 # tolerance for convergence
# Fully initialize x, z, and w for faster computation (instead of .append)
x = np.zeros((max_iter+1, a.size))
z = np.ones((max_iter+1, a.size)) * np.mean(np.r_[a, b], 0)
w = np.zeros((max_iter+1, a.size))
f_x = np.zeros(max_iter)
res = np.zeros(max_iter)
f_x_f = lambda X: 0.5 * X.T @ P @ X + q.T @ X
P_x = np.linalg.inv(P+P.T+rho)
G_z = rho/(lam+rho)
# ADMM
for i in range(max_iter):
x[i+1] = 2 * (rho * (z[i] - w[i])-q.T) @ P_x
z[i+1] = np.minimum(b, np.maximum(a, (x[i+1] + w[i]) * G_z))
w[i+1] = w[i] + x[i+1] - z[i+1]
f_x[i] = f_x_f(x[i+1])
res[i] = np.linalg.norm(x[i+1]-z[i+1])
if res[i] < tol: break
print('Algorithm converged in {} iterations with tolerance of {}.'.format(i+1, tol))
print('f(x) at convergence:', np.round(f_x[i],3))
Algorithm converged in 44 iterations with tolerance of 1e-09. f(x) at convergence: 6.902
plt.figure(figsize=(10,5)), plt.subplot(1,2,1), plt.plot(f_x[1:i+1])
plt.title(r'$f(x)$ vs. number of iterations'), plt.subplot(1,2,2)
plt.plot(res[1:i+1]), plt.title(r'$\Vert{x-z}\Vert_{2}$ vs. number of iterations')
plt.figure(figsize=(10,3)), plt.stem(x[i]), plt.title(r'Plot of $X$s at Convergence')
plt.show()
Considering the following (slightly modified) optimization of Ridge Regression penalized sum of squares, we solve for optimal $w_j$ using coordinate descent:
\begin{equation*} \begin{aligned} \text{for: } & \\ \underset{w}{\min F_{( w)}} & =\underset{w}{\min}\frac{1}{2}\Vert y-Xw\Vert _{2}^{2} +\frac{\lambda }{2}\Vert w\Vert _{2}^{2}\\ \text{we have: } & \\ F_{( w)} & =\frac{1}{2}\Vert y -\sum _{j=1}^{p} x_{j} w_{j}\Vert ^{2} +\frac{\lambda }{2}\sum _{j=1}^{p} w_{j}^{2} \ \\ \frac{\partial F_{( w)}}{\partial w_{j}} & =\left(\frac{1}{2}\right)( -2x_{j})\left( y -\sum _{k\neq j}^{p} w_{k} x_{k} -x_{j} w_{j}\right) +\left(\frac{\lambda }{2}\right) 2w_{j}\\ \text{If we set: } & r_{-j} =y -\sum _{k\neq j}^{p} w_{k} x_{k}\\ \frac{\partial F_{( w)}}{\partial w} & =-x_{j}( r_{-j} -w_{j} x_{j}) +\lambda w_{j} =0\ \ \ \rightarrow \\ w_{j} & =\frac{x_{j}^{T} r_{-j}}{x_{j}^{T} x_{j} +\lambda }\\ \text{and if we set: } & \tilde{x}_{j} =\frac{x_{j}}{\Vert x_{j}\Vert } \ \quad \text{we'll have:} \ \tilde{x}_{j}^{T}\tilde{x}_{j} =1\ \\ w_{j} & =\frac{\tilde{x}_{j}^{T} r_{-j}}{1+\lambda } ,\quad \text{where} \ r_{-j} =y -\sum _{k\neq j}^{p} w_{k}\tilde{x}_{k} \end{aligned} \end{equation*}Considering the following optimization, we solve for optimal $w_j$ using coordinate descent:
\begin{equation*} \begin{aligned} \text{for: } & \\ \underset{w}{\min F_{( w)}} & =\underset{w}{\min}\frac{1}{2}\Vert y-Xw\Vert _{2}^{2} +\lambda _{1}| w| _{1} +\frac{\lambda _{2}}{2}\Vert w\Vert _{2}^{2}\\ \text{we have: } & \\ F_{( w)} & =\frac{1}{2}\Vert y -\sum _{j=1}^{p} x_{j} w_{j}\Vert ^{2} +\frac{\lambda _{2}}{2}\sum _{j=1}^{p} w_{j}^{2} \ +\lambda _{1}\sum _{j=1}^{p}| w_{j}| \\ \frac{\partial F_{( w)}}{\partial w_{j}} & =-x_{j}\left( y -\sum _{k\neq j}^{p} w_{k} x_{k} -x_{j} w_{j}\right) +\lambda _{2} w_{j} +\lambda _{1}\text{sgn}( w_{j})\\ \text{If we set: } & r_{-j} =y -\sum _{k\neq j}^{p} w_{k} x_{k}\\ \frac{\partial F_{( w)}}{\partial w} & =-x_{j}( r_{-j} -w_{j} x_{j}) +\lambda _{2} w_{j} +\lambda _{1}\text{sgn}( w_{j}) =0\ \ \ \rightarrow \\ w_{j} & =\frac{x_{j}^{T} r_{-j} -\lambda _{1}\text{sgn}( w_{j})}{x_{j}^{T} x_{j} +\lambda _{2}} =\frac{\tilde{x}_{j}^{T} r_{-j} -\lambda _{1}\text{sgn}( w_{j})}{1+\lambda _{2}} \end{aligned} \end{equation*}Here we apply the regression algorithms we derived in the previous sections to train a model for predicting performance decay over time of the Gas Turbine compressor.
import numpy as np
from numpy.linalg import norm
from numpy import delete as sans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
data = np.loadtxt('Topic5.csv', delimiter=',')
X_train, y_train = data[:2000, :-1], data[:2000, -1]
X_test, y_test = data[2000:, :-1], data[2000:, -1]
# Regularization, Normalization and added column of ones for intercept
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.transform(X_test)
X_train2 = np.c_[np.ones(X_train2.shape[0]), X_train]
X_test2 = np.c_[np.ones(X_test2.shape[0]), X_test]
X_norms = norm(X_train2, axis=0)
X_norms = np.where(X_norms == 0, 1, X_norms)
X_train_ = X_train2 / X_norms # Normalization
X_test_ = X_test2 / X_norms
res_ = lambda W: norm(y_train - X_train_ @ W)**2
r_j_ = lambda J, W: y_train - sans(X_train_, J, axis=1) @ sans(W, J)
def coordescent(wfunc, max_iter = 500):
w = np.log1p(norm(X_train_, axis=0)**0.5) # instead of a seed
res = np.zeros(max_iter)
for i in range(max_iter):
for j in range(w.size):
r_j = r_j_(j, w)
w[j] = wfunc(r_j, j, w)
res[i] = res_(w)
return w, res
With predefined requirements:
lam = 0.1
w_p1_ = lambda R, J, W: (X_train_[:,J].T @ R) / (1+lam)
w_p1, res_p1 = coordescent(w_p1_, max_iter=300)
plt.plot(res_p1), plt.yscale('log')
plt.title(r'$||y-Xw||_{2}^{2}$ vs. # of Iterations')
plt.xlabel(r'SSR of Final $w$:%2.3f' %(res_p1[-1]))
plt.show()
print('Values of \'w\' at Convergence:')
print(np.round(w_p1,2))
Values of 'w' at Convergence: [ 9.83 0.72 0.62 -2.37 3.66 7.45 -2.99 5.82 7.86 2.12 2.38 9.61 -1.42 -1.52]
SAE_tst_p1 = np.sum(np.abs(y_test - X_test_ @ w_p1))
print('SAE of the Test Set: %3.3f' % SAE_tst_p1)
SAE of the Test Set: 16.383
With some optimizations:
lam = 0.01
w_p1_ = lambda R, J, W: (X_train_[:,J].T @ R) / (1+lam)
w_p1, res_p1 = coordescent(w_p1_, max_iter=4) # Catching the first dip.
plt.plot(res_p1), plt.yscale('log')
plt.title(r'$||y-Xw||_{2}^{2}$ vs. # of Iterations')
plt.xlabel(r'SSR of Final $w$:%2.3f' %(res_p1[-1]))
plt.show()
print('Values of \'w\' at Convergence:')
print(np.round(w_p1,2))
Values of 'w' at Convergence: [35.57 -0.41 0.62 -0.41 2.32 1.81 -1.89 2.44 1.18 -0.37 0.72 2. -1.1 0.63]
SAE_tst_p1 = np.sum(np.abs(y_test - X_test_ @ w_p1))
print('SAE of the Test Set: %3.3f' % SAE_tst_p1)
SAE of the Test Set: 5.217
With predefined requirements:
lam1 = 0.05
lam2 = 0.01
w_p2_ = lambda R, J, W: (X_train_[:,J].T @ R - lam1*np.sign(W[J])) / (1+lam2)
w_p2, res_p2 = coordescent(w_p2_, max_iter=400)
plt.plot(res_p2), plt.yscale('log')
plt.title(r'$||y-Xw||_{2}^{2}$ vs. # of Iterations')
plt.xlabel(r'SSR of Final $w$:%2.3f' %(res_p2[-1]))
plt.show()
print('Values of \'w\' at Convergence:')
print(np.round(w_p2,decimals=2))
[ 1.178e+01 -1.000e-02 -5.300e-01 -3.840e+00 3.000e-02 5.760e+00 -8.300e-01 5.680e+00 1.051e+01 2.000e-01 2.800e-01 1.358e+01 1.000e-02 1.000e-02]
SAE_tst_p2 = np.sum(np.abs(y_test - X_test_ @ w_p2))
print('SAE of the Test Set: %3.3f' % SAE_tst_p2)
SAE of the Test Set: 6.858
With some optimizations:
lam1 = 0.01
lam2 = 0.005
w_p2_ = lambda R, J, W: (X_train_[:,J].T @ R - lam1*np.sign(W[J])) / (1+lam2)
w_p2, res_p2 = coordescent(w_p2_, max_iter=5)
plt.plot(res_p2)
plt.title(r'$||y-Xw||_{2}^{2}$ vs. # of Iterations')
plt.xlabel(r'SSR of Final $w$:%2.3f' %(res_p2[-1]))
plt.show()
print('Values of \'w\' at Convergence:')
print(np.round(w_p2,decimals=2))
[36.08 -0.8 0.55 -0.3 2.2 1.76 -1.76 2.34 1.15 -0.29 0.68 1.97 -1.01 0.57]
SAE_tst_p2 = np.sum(np.abs(y_test - X_test_ @ w_p2))
print('SAE of the Test Set: %3.3f' % SAE_tst_p2)
SAE of the Test Set: 4.994
import numpy as np
from numpy import maximum as mxm, minimum as mnm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
data = np.loadtxt('Topic6.csv', delimiter=',')
X_trn, y_trn = data[:1000, :-1], data[:1000, -1]
X_tst, y_tst = data[1000:, :-1], data[1000:, -1]
grad_inr = lambda x, y, thet: (x*np.exp(x*thet))/(1 + np.exp(x*thet))-x*y
predict = lambda x, thet, lamt: (1/(1 + np.exp(-x@thet)))>lamt
def prox_grad(X_train, y_train, lam1, lam2, t, max_iter=10):
acc_l = np.zeros(max_iter+1)
theta = np.zeros(X_train.shape[1])
acc_l[0] = accuracy_score(y_train, predict(X_train, theta, lam1*t))
for i in range(max_iter):
grad = np.array([
grad_inr(X_train[j], y_train[j], theta) for j in
range(y_train.size)]).sum(axis=0)/y_train.size + lam2*theta
theta_ = theta - t*grad
theta = mxm(theta_-t*lam1,0) + mnm(theta_+t*lam1,0)
acc_l[i+1] = accuracy_score(y_train, predict(X_train, theta, lam1*t))
print('Total non-negative weights:')
print(np.sum(theta!=0))
return theta, acc_l
%%time
lam1 = 10
lam2 = 5
t = 2e-5
theta, acc_l = prox_grad(X_trn, y_trn, lam1, lam2, t, max_iter=40)
Total non-negative weights: 253 CPU times: total: 6.75 s Wall time: 886 ms
plt.figure(figsize=(10,4)), plt.plot(acc_l), plt.tight_layout()
plt.title('Plot of Accuracy of Training vs. # of Iterations')
plt.show()
y_tst_pred = predict(X_tst, theta, lam1*t)
test_acc = accuracy_score(y_tst, y_tst_pred)
print('Our model predicted the test set labels with',
test_acc*100,'percent accuracy.\n '
'(Misclassified only',(y_tst_pred!=y_tst).sum(),'element(s).)')
Our model predicted Test Set labels with 99.9 percent accuracy. (Misclassified only 1 element(s).)