Computing the natural cubic spline $S$ which interpolates $f$ at the knots 1, 2, and 5.
$$ S(x) =\begin{cases} S_1(x)=a_1+b_1 (x-1)+c_1 (x-1)^2+d_1 (x-1)^3, & x\in[1,2] \\ S_2(x)=a_2+b_2 (x-2)+c_2 (x-2)^2+d_2 (x-2)^3, & x\in[2,5] \end{cases} \qquad \text{for} \quad f(x) =\frac{1}{x^{2}} $$We find the coefficients $a_i$, $b_i$, $c_i$, and $d_i$ of the spline $S$ using the inverse matrix method. The four defining properties of a natural cubic spline give the following conditions:
$$ \begin{array}{ l } \text{for} \ x=[1,2,5]\ \text{and} \ y=[1,\ 0.25,\ 0.04]\\ \begin{array}{ l l } \text{we have:} \ & S_{1} (1)=1,\ S_{2} (2)=0.25,\ S_{1} (2)=0.25,\ S_{2} (5)=0.04\\ & S_{1}^{'} (2)=S_{2}^{'} (2)\ \ \ \text{and} \ \ S_{1}^{''} (2)=S_{2}^{''} (2)\\ & S_{1}^{''} (1)=0,\ S_{2}^{''} (5)=0\\ & \text{and}\\ & S^{'} (x)=\begin{cases} b_{1} +2c_{1}( x-1) +3d_{1}( x-1)^{2}, \quad x\in [ 1,2]\\ b_{2} +2c_{2}( x-2) +3d_{2}( x-2)^{2}, \quad x\in [ 2,5] \end{cases}\\ & S^{''} (x)=\begin{cases} 2c_{1} +6d_{1}( x-1), \quad x\in [ 1,2]\\ 2c_{2} +6d_{2}( x-2), \quad x\in [ 2,5] \end{cases} \end{array} \end{array} $$These conditions reduce to the following system of equations:
$$ \begin{array}{l} a_{1}=1 \\ a_{2}=0.25 \\ a_{1}+b_{1}+c_{1}+d_{1}=0.25 \\ a_{2}+3b_{2}+9c_{2}+27d_{2}=0.04 \\ b_{1}+2c_{1}+3d_{1}-b_{2}=0 \\ c_{1}+3d_{1}-c_{2}=0 \\ c_{1}=0 \\ c_{2}+9d_{2}=0 \end{array} \Rightarrow \begin{array}{ l } A\cdot X=B\\ \begin{bmatrix} \ 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0\\ \ 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0\\ \ 1 & 1 & 1 & 1 & 0 & 0 & 0 & 0\\ \ 0 & 0 & 0 & 0 & 1 & 3 & 9 & 27\\ \ 0 & 1 & 2 & 3 & 0 & -1 & 0 & 0\\ \ 0 & 0 & 1 & 3 & 0 & 0 & -1 & 0\\ \ 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0\\ \ 0 & 0 & 0 & 0 & 0 & 0 & 1 & 9 \end{bmatrix}\begin{bmatrix} a_{1}\\ b_{1}\\ c_{1}\\ d_{1}\\ a_{2}\\ b_{2}\\ c_{2}\\ d_{2} \end{bmatrix} =\begin{bmatrix} \ 1 \\ \ 0.25 \\ \ 0.25 \\ \ 0.04 \\ \ 0 \\ \ 0 \\ \ 0 \\ \ 0 \end{bmatrix} \end{array} $$Now we calculate $ A^{-1} \cdot B $ to find $ X $ which is the solution of the system of equations.
import numpy as np
import pandas as pd
A = np.array([[1,0,0,0,0,0,0,0],
[0,0,0,0,1,0,0,0],
[1,1,1,1,0,0,0,0],
[0,0,0,0,1,3,9,27],
[0,1,2,3,0,-1,0,0],
[0,0,1,3,0,0,-1,0],
[0,0,1,0,0,0,0,0],
[0,0,0,0,0,0,1,9]])
B = np.array([1,0.25,0.25,0.04,0,0,0,0])
# Solve the system via the inverse matrix: X = A^(-1) B
X = np.linalg.inv(A).dot(B)
# Rows: spline pieces on [1,2] and [2,5]; columns: coefficients a, b, c, d
coeff = pd.DataFrame(X.reshape(2,4),
                     index=['1','2'],
                     columns=['a','b','c','d'])
print(coeff.round(3))
      a      b      c      d
1  1.00 -0.835  0.000  0.085
2  0.25 -0.580  0.255 -0.028
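As a quick sanity check (a minimal sketch reusing the A, B, and X arrays defined above), we can verify that the computed coefficients actually satisfy the original system:
# Verify the solution of the linear system
assert np.allclose(A.dot(X), B)
# np.linalg.solve gives the same result without forming an explicit inverse
assert np.allclose(np.linalg.solve(A, B), X)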
Now we can use the coefficients $a_i$, $b_i$, $c_i$, and $d_i$ to plot the spline $S$ and the original function $f$.
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 120
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.family'] = 'serif'
plt.rcParams['mathtext.fontset'] = 'stix'
def f(x):
    return 1/x**2

def S_1(x):
    return coeff.loc['1','a']+coeff.loc['1','b']*(x-1)+\
           coeff.loc['1','c']*(x-1)**2+coeff.loc['1','d']*(x-1)**3

def S_2(x):
    return coeff.loc['2','a']+coeff.loc['2','b']*(x-2)+\
           coeff.loc['2','c']*(x-2)**2+coeff.loc['2','d']*(x-2)**3
x = np.linspace(0.9,5.1,100)
X_1 = np.linspace(1,2,100)
X_2 = np.linspace(2,5,100)
plt.figure(figsize=(6,4))
plt.plot(x,f(x),label=r'$f\/(x)$')
plt.plot(X_1,S_1(X_1),label=r'$S\/(x=[1,2])$')
plt.plot(X_2,S_2(X_2),label=r'$S\/(x=[2,5])$')
plt.xlim(0.9,5.1)
plt.ylim(-0.21,1.1)
knots = np.array([1,2,5])
plt.plot(knots, f(knots), 'ro', label='Knots')
plt.legend()
plt.tight_layout()
plt.show()
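To quantify how closely the spline tracks $f$ between the knots, a short check (a sketch reusing the f, S_1, S_2, X_1, and X_2 defined above) reports the largest absolute deviation on each piece:
# Maximum absolute deviation of the spline from f on each interval
err_1 = np.max(np.abs(S_1(X_1) - f(X_1)))
err_2 = np.max(np.abs(S_2(X_2) - f(X_2)))
print(f'max |S - f| on [1,2]: {err_1:.4f}, on [2,5]: {err_2:.4f}')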
Consider the penalized least squares problems:
$$ \begin{array}{ l } \hat{f}_{\!1} =\ \underset{f}{\min}\left[\sum _{i=1}^{N}( y_{i} -f( x_{i}))^{2} +\lambda \int _{a}^{b}\left( f^{( m)}( x)\right)^{2} dx\right]\\ \hat{f}_{\!2} =\ \underset{f}{\min}\left[\sum _{i=1}^{N}( y_{i} -f( x_{i}))^{2} +\lambda \int _{a}^{b} \left( f^{( m+k)}( x)\right)^{2} dx\right], \quad \text{with} \ \: k\in \mathbb{Z}^{+} \end{array} $$Finding the degree of polynomial for $\hat{f}_{\!1}$ with the following parameters:
(1) $m=0, \lambda=\infty \quad$ (2) $m=1, \lambda=\infty \quad$ (3) $m=2, \lambda=\infty \quad$ (4) $m=3, \lambda=0$
Let's first state the relevant facts about the smoothing parameter $\lambda$:
For $\hat{f} =\ \underset{f}{\min}\left[\sum _{i=1}^{N}( y_{i} -f( x_{i}))^{2} +\lambda \int _{a}^{b} (\boldsymbol{F})^{2} dx\right]$: as $\lambda \rightarrow 0$ the penalty loses its effect on the objective function, so the minimizer interpolates the data fully. As $\lambda \rightarrow \infty$ the penalty dominates the objective, and at $\lambda = \infty$ the objective stays finite only if the integral is zero, which happens only if $\boldsymbol{F}=0$.
$ \begin{array}{ l } \text{(1)} \quad \qquad \hat{f}_{1(m=0,\lambda =\infty )} =\ \underset{f}{\min}\left[\sum _{i=1}^{N} (y_{i} -f(x_{i} ))^{2} +\infty \int _{a}^{b}( f(x))^{2} dx\right]\\ \quad \quad \quad \quad \left[ f( x) =0\ \rightarrow \ \hat{f}( x) =0\right]\rightarrow \deg\hat{f}_{1(m=0,\lambda =\infty )} =0\\ \quad \\ \text{(2)} \quad \qquad \hat{f}_{1(m=1,\lambda =\infty )} =\ \underset{f}{\min}\left[\sum _{i=1}^{N} (y_{i} -f(x_{i} ))^{2} +\infty \int _{a}^{b}( f'(x))^{2} dx\right]\\ \quad \quad \quad \quad \left[ f'( x) =0\ \rightarrow \ \hat{f}( x) =\text{constant}\right]\rightarrow \deg\hat{f}_{1(m=1,\lambda =\infty )} =0\\ \quad \\ \text{(3)} \quad \qquad \hat{f}_{1(m=2,\lambda =\infty )} =\ \underset{f}{\min}\left[\sum _{i=1}^{N} (y_{i} -f(x_{i} ))^{2} +\infty \int _{a}^{b}( f''(x))^{2} dx\right]\\ \quad \quad \quad \quad \left[ f''( x) =0\ \rightarrow \ \hat{f}( x) =ax+b\right]\rightarrow \deg\hat{f}_{1(m=2,\lambda =\infty )} =\ 1\\ \quad \\ \text{(4)} \quad \qquad \hat{f}_{1(m=3,\lambda =0)} =\ \underset{f}{\min}\left[\sum _{i=1}^{N} (y_{i} -f(x_{i} ))^{2} +0\int _{a}^{b}\left( f^{( 3)} (x)\right)^{2} dx\right]\\ \quad \quad \quad \quad [ \ \hat{f}( x) =\underset{f}{\min}[\sum _{i=1}^{N} (y_{i} -f(x_{i} ))^{2}]]\rightarrow \deg\hat{f}_{1(m=3,\lambda =0)} =2 \end{array} $
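The two limits of $\lambda$ described above can also be checked numerically. The sketch below uses synthetic data and scipy's UnivariateSpline, whose smoothing condition s is not literally $\lambda$ but behaves analogously at the extremes: s = 0 forces interpolation of the data, while a very large s collapses the fit to a single low-complexity polynomial piece with a much larger training RSS.
import numpy as np
from scipy.interpolate import UnivariateSpline

rng = np.random.default_rng(0)
xd = np.linspace(0, 1, 20)
yd = np.sin(2 * np.pi * xd) + rng.normal(scale=0.2, size=xd.size)

spl_interp = UnivariateSpline(xd, yd, k=3, s=0)    # no smoothing: interpolates the data
spl_smooth = UnivariateSpline(xd, yd, k=3, s=1e3)  # huge s: a single smooth cubic piece

print('training RSS with s=0:', np.sum((spl_interp(xd) - yd) ** 2))
print('training RSS with very large s:', np.sum((spl_smooth(xd) - yd) ** 2))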
(1) $\;\;$ Will $\hat{f}_{\!1}$ or $\hat{f}_{\!2}$ have the smaller training residual sum of squares as $\lambda \rightarrow \infty$?
This asks which of the two functions fits the training data more closely. From part 1, if $\deg\hat{f}_{1(m,\lambda \rightarrow \infty )} = m-1$, then $\deg\hat{f}_{2(m+k,\lambda \rightarrow \infty )} = m+k-1$. Since $\hat{f}_{2}$ is a polynomial of higher degree, it can follow the training data more closely, resulting in a smaller training residual sum of squares.
(2) $\;\;$ Will $\hat{f}_{\!1}$ or $\hat{f}_{\!2}$ have the smaller test residual sum of squares as $\lambda \rightarrow \infty$?
By the same reasoning as (1), $\hat{f}_{\!1}$ is the more heavily smoothed (lower-degree) fit, so we expect it to have a smaller test residual sum of squares than $\hat{f}_{\!2}$ as $\lambda \rightarrow \infty$.
(3) $\;\;$ Will $\hat{f}_{\!1}$ or $\hat{f}_{\!2}$ have the smaller training and test residual sums of squares as $\lambda \rightarrow 0$?
As the penalty disappears, both $\hat{f}_{\!1}$ and $\hat{f}_{\!2}$ interpolate the training data, so their training and test residual sums of squares are the same.
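To make the ordering claimed in (1) concrete, here is a tiny simulation (a sketch with made-up data, using plain polynomial least squares as a stand-in for the $\lambda \rightarrow \infty$ limits derived above): the higher-degree fit never has a larger training RSS.
import numpy as np

rng = np.random.default_rng(1)
xd = np.linspace(0, 1, 15)
yd = 1.0 / (xd + 0.5) + rng.normal(scale=0.05, size=xd.size)

# Degree 1 stands in for the f_hat_1 limit (m = 2); degree 3 for the f_hat_2 limit (k = 2)
for deg in (1, 3):
    coef = np.polyfit(xd, yd, deg)
    rss = np.sum((np.polyval(coef, xd) - yd) ** 2)
    print(f'degree {deg}: training RSS = {rss:.4f}')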
Spectral data from 40 soil samples is loaded, with 1000 predictors per sample. Here we take a look at four different approaches to smoothing this signal.
Please note that the optimal parameters are reported on the plot of the smoothed data.
# Loading libraries and setting default parameters
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedKFold
from scipy.interpolate import CubicSpline, splrep, splev, LSQUnivariateSpline
from scipy.signal import convolve
plt.rcParams['figure.dpi'] = 150
Since the data we'll be working with is the average of the 40 samples, we've denoted the features as x and the functional mean as y (similar to what is expected on an x-y plot).
# Loading The Data
x = np.arange(0,1000)
y = np.mean(np.loadtxt('Question3.csv', delimiter=','), axis=0)
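Before fitting anything, a quick sanity check (a minimal sketch, assuming Question3.csv parses into the 40 × 1000 matrix described above) confirms the averaged signal lines up one-to-one with the predictor index:
# Sanity check: one averaged value per predictor (assumes a 40 x 1000 input matrix)
assert x.shape == y.shape == (1000,)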
# 5-Fold Cross Validation of Signal Data with Natural Cubic Spline Interpolation for number of knots:
kf = RepeatedKFold(n_splits=5, n_repeats=20, random_state=42)
# 5-Fold Cross Validation function for Natural Cubic Spline Interpolation
def kcv_NCS(kf, x, y, knots):
    MSE = []
    for train_index, test_index in kf.split(x):
        # Evenly spaced interior knots over the training index range
        xs = np.linspace(0,train_index.size,knots).astype(int)[1:-1]
        train_index = np.sort(train_index)
        # Least-squares cubic spline with the chosen interior knots
        tck = LSQUnivariateSpline(train_index, y[train_index], k=3, t=xs)
        y_pred = tck(test_index)
        MSE.append(np.mean((y_pred - y[test_index]) ** 2))
    return np.mean(MSE)
# 5-Fold Cross Validation function for Cubic B-Spline Interpolation
def kcv_CBS(kf, x, y, knots):
    MSE = []
    for train_index, test_index in kf.split(x):
        # Evenly spaced interior knots over the training index range
        xs = np.linspace(0,train_index.size,knots).astype(int)[1:-1]
        train_index = np.sort(train_index)
        # Interpolating B-spline representation with the chosen interior knots
        tck = splrep(train_index, y[train_index], k=3, s=0, t=xs)
        y_pred = splev(test_index, tck)
        MSE.append(np.mean((y_pred - y[test_index]) ** 2))
    return np.mean(MSE)
MSE_NCS = []
params_NCS = np.arange(5,51)
for knot in params_NCS:
    MSE_NCS.append(kcv_NCS(kf, x, y, knot))
MSE_CBS = []
params_CBS = np.arange(5,51)
for knot in params_CBS:
    MSE_CBS.append(kcv_CBS(kf, x, y, knot))
# Plot of MSE of Natural Cubic Spline and Cubic B-Spline for number of knots
plt.plot(params_NCS, MSE_NCS, label='Natural Cubic Spline')
plt.plot(params_CBS, MSE_CBS, '-.', label='Cubic B-Spline')
plt.xlim(4.3,50.5)
plt.xlabel('Number of knots')
plt.ylabel('MSE')
plt.title('MSE of Natural Cubic Spline and Cubic B-Spline')
plt.legend()
plt.show()
# Natural Cubic Spline Interpolation with optimal number of knots
knots = params_NCS[np.argmin(MSE_NCS)]
xs = np.linspace(0,999,knots).astype(int)
cs = CubicSpline(xs, y[xs], bc_type='natural')
y_NCS = cs(x)
# Plot of the interpolated signal data and the original signal data
plt.plot(x, y, marker='x',label='Original Signal', markersize=1, color='red',
linewidth=0.5)
plt.plot(x, y_NCS, label='Natural Cubic Spline with ' + str(knots) + ' knots',
color='teal', linewidth=3.2, alpha=0.8)
plt.plot([],[], ' ', label=r'Overall MSE: '+str(np.round(np.mean((y_NCS - y)**2),3)))
plt.title('Natural Cubic Spline with Optimal Number of Knots')
plt.legend()
plt.show()
# Cubic B-Spline Interpolation with optimal number of knots
knots = params_CBS[np.argmin(MSE_CBS)]
xs = np.linspace(0,999,knots).astype(int)
tck = splrep(xs, y[xs], k=3, s=0)
y_CBS = splev(x, tck)
# Plot of the interpolated signal data and the original signal data
plt.plot(x, y, marker='x',label='Original Signal', markersize=1, color='red',
linewidth=0.5)
plt.plot(x, y_CBS, label='Cubic B-Spline with ' + str(knots) + ' knots',
color='teal', linewidth=3.2, alpha=0.8)
plt.plot([],[], ' ', label=r'Overall MSE: '+str(np.round(np.mean((y_CBS - y)**2),3)))
plt.title('Cubic B-Spline Interpolation with Optimal Number of Knots')
plt.legend()
plt.show()
# 5-Fold Cross Validation function for Smoothing Spline Interpolation for optimal lambda
def kcv_SS(kf, x, y, lam):
    MSE = []
    for train_index, test_index in kf.split(x):
        xs = np.sort(train_index)
        ys = y[xs]
        # Smoothing spline: s controls the trade-off between fit and smoothness
        tck = splrep(xs, ys, k=3, s=lam)
        y_pred = splev(test_index, tck)
        MSE.append(np.mean((y_pred - y[test_index]) ** 2))
    return np.mean(MSE)
MSE_SSS = []  # Averaging over 5 runs for a smoother swoosh!
for i in range(5):
    MSE_SS = []
    params_SS = np.linspace(0.9,3,20)
    for lam in params_SS:
        MSE_SS.append(kcv_SS(kf, x, y, lam))
    MSE_SSS.append(MSE_SS)
MSE_SSS = np.mean(np.array(MSE_SSS), axis=0)
# Plot of the MSE of Smoothing Spline Interpolation vs Lambda
plt.plot(params_SS, MSE_SSS)
plt.xlabel('Lambda')
plt.ylabel('MSE')
plt.title('MSE of Smoothing Spline vs Lambda Value')
plt.xlim(0.85,3.05)
plt.show()
# Smoothing Spline Interpolation with optimal lambda (from the run-averaged CV curve)
lam = params_SS[np.argmin(MSE_SSS)]
xs = np.sort(x)
ys = y[xs]
tck = splrep(xs, ys, k=3, s=lam)
y_SS = splev(x, tck)
# Plot of the interpolated signal data and the original signal data
plt.plot(x, y, marker='x',label='Original Signal', markersize=1, color='red',
linewidth=0.5)
plt.plot(x, y_SS, label=r'Smoothing Spline with $\lambda=$'+ str(np.round(lam,3)),
color='teal', linewidth=3.2, alpha=0.8)
plt.plot([],[], ' ', label=r'Overall MSE: '+str(np.round(np.mean((y_SS - y)**2),3)))
plt.title(r'Smoothing Spline Interpolation with Optimal $\lambda$')
plt.legend()
plt.show()
# Gaussian Kernel Smoothing
def gaussian_kernel(x, y, sigma):
    return np.exp(-(x - y) ** 2 / (2 * sigma ** 2)) / (np.sqrt(2 * np.pi) * sigma)

def smooth(x, y, h):
    # Convolution-based smoothing helper (not used below)
    return convolve(y, gaussian_kernel(x, x, h), mode='same') / convolve(
        np.ones_like(y), gaussian_kernel(x, x, h), mode='same')

def kern_interp(x, y, xi, h):
    # Kernel-weighted sum at each query point; with unit-spaced x the
    # normalized Gaussian weights sum to approximately one
    return np.array([np.sum(y * gaussian_kernel(x, xi_i, h)) for xi_i in xi])

# Custom 5-Fold Cross Validation function for Gaussian Kernel Smoothing with optimal bandwidth
def kcv_GS(kf, x, y, bw):
    MSE = []
    for train_index, test_index in kf.split(x):
        xs = np.sort(train_index)
        ys = y[xs]
        y_pred = kern_interp(xs, ys, test_index, bw)
        MSE.append(np.mean((y_pred - y[test_index]) ** 2))
    return np.mean(MSE)
MSE_GS = []
params_GS = np.linspace(2,30,30)
for bw in params_GS:
    MSE_GS.append(kcv_GS(kf, x, y, bw))
# Plotting the MSE of Gaussian Kernel Smoothing vs Bandwidth
plt.plot(params_GS, MSE_GS)
plt.xlabel('Bandwidth')
plt.ylabel('MSE')
plt.title('MSE of Gaussian Kernel Smoothing vs Bandwidth')
plt.xlim(1.5,30.5)
plt.show()
# Gaussian Kernel Smoothing with optimal bandwidth
bw = params_GS[np.argmin(MSE_GS)]
y_GS = kern_interp(x, y, x, bw)
# Plot of the interpolated signal data and the original signal data
plt.plot(x, y, marker='x',label='Original Signal', markersize=1, color='red',
linewidth=0.5)
plt.plot(x, y_GS, label=r'Gaussian Kernel with $\sigma=$'+ str(np.round(bw,3)),
color='teal', linewidth=3.2, alpha=0.8)
plt.plot([],[], ' ', label=r'Overall MSE: '+str(np.round(np.mean((y_GS - y)**2),3)))
plt.title(r'Gaussian Kernel with Optimal $\sigma$')
plt.legend()
plt.show()
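Finally, a compact comparison (a sketch reusing the smoothed signals y_NCS, y_CBS, y_SS, and y_GS computed above) gathers the overall in-sample MSE of the four approaches in one table:
import pandas as pd
# Overall in-sample MSE of each smoothing approach against the mean signal
summary = pd.DataFrame({
    'Method': ['Natural Cubic Spline', 'Cubic B-Spline',
               'Smoothing Spline', 'Gaussian Kernel'],
    'Overall MSE': [np.mean((y_NCS - y) ** 2), np.mean((y_CBS - y) ** 2),
                    np.mean((y_SS - y) ** 2), np.mean((y_GS - y) ** 2)]
})
print(summary.round(3))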
import pandas as pd
from utils_4 import SVM_bspline, SVM_fPCA, plt_cm
data = pd.read_csv('Question4.csv', header=None)
x, y = data.iloc[:, :-1], data.iloc[:, -1]
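Before fitting, a quick look at the data shape and label balance (a minimal sketch; it assumes the last column of Question4.csv holds the country labels, as the reports below suggest) helps put the per-class scores in context:
# Number of samples/predictors and class distribution of the labels
print(x.shape)
print(y.value_counts())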
The function SVM_bspline in utils_4.py implements the following steps:
The data is split 30/30 into train and test sets without shuffling; the function then fits the train data with the following pipeline of transformers: StandardScaler → MinMaxScaler → BSplineC (cubic B-spline coefficient extraction) → StandardScaler → MinMaxScaler → SVC (RBF kernel).
The test data is then passed through the fitted pipeline to produce the test predictions, which are compared against the actual labels to produce the classification report and the confusion matrix.
Both the structure of the pipelines and the SVM parameters were optimized with different cross-validation approaches, some of which can be found in the utils_4.py file (see the sketch below).
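For instance, the grid-search helper could be invoked along these lines (a hypothetical invocation only; SVM_fPCA_CV is defined in the utils_4.py listing below with a default C/gamma grid, but this call is not part of the reported runs):
from utils_4 import SVM_fPCA_CV
# Hypothetical grid search over the SVC's C and gamma for the fPCA pipeline
search = SVM_fPCA_CV(x, y, n_components=10, k=5, repeats=5)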
print("Cubic B-Spline with 70 knots Feature Extraction")
_, bspline_cm = SVM_bspline(x,y)
_ = plt_cm(bspline_cm)
Cubic B-Spline with 70 knots Feature Extraction
SVM Classification Report:
              precision    recall  f1-score   support

      Greece       1.00      1.00      1.00         5
       Italy       0.82      1.00      0.90         9
    Portugal       1.00      0.50      0.67         4
       Spain       0.92      0.92      0.92        12

    accuracy                           0.90        30
   macro avg       0.93      0.85      0.87        30
weighted avg       0.91      0.90      0.89        30
The function SVM_fPCA in utils_4.py implements the following steps:
The data is split 30/30 into train and test sets without shuffling; the function then fits the train data with the following pipeline of transformers: StandardScaler → DfToFdat (conversion to an skfda FDataGrid) → FPCA → StandardScaler → MinMaxScaler → SVC (RBF kernel).
From here, the steps follow the same procedure as above.
print("fPCA with 2 harmonics Feature Extraction")
_, fPCA_cm = SVM_fPCA(x,y,2)
_ = plt_cm(fPCA_cm)
fPCA with 2 harmonics Feature Extraction
SVM Classification Report:
              precision    recall  f1-score   support

      Greece       1.00      1.00      1.00         5
       Italy       0.75      1.00      0.86         9
    Portugal       1.00      0.25      0.40         4
       Spain       0.92      0.92      0.92        12

    accuracy                           0.87        30
   macro avg       0.92      0.79      0.79        30
weighted avg       0.89      0.87      0.84        30
print("fPCA with 5 harmonics Feature Extraction")
_, fPCA_cm = SVM_fPCA(x,y,5,16)
_ = plt_cm(fPCA_cm)
fPCA with 5 harmonics Feature Extraction
SVM Classification Report:
              precision    recall  f1-score   support

      Greece       1.00      1.00      1.00         5
       Italy       0.82      1.00      0.90         9
    Portugal       1.00      0.25      0.40         4
       Spain       0.85      0.92      0.88        12

    accuracy                           0.87        30
   macro avg       0.92      0.79      0.79        30
weighted avg       0.88      0.87      0.84        30
print("fPCA with 8 harmonics Feature Extraction")
_, fPCA_cm = SVM_fPCA(x,y,8)
_ = plt_cm(fPCA_cm)
fPCA with 8 harmonics Feature Extraction
SVM Classification Report:
              precision    recall  f1-score   support

      Greece       1.00      0.80      0.89         5
       Italy       0.90      1.00      0.95         9
    Portugal       1.00      0.75      0.86         4
       Spain       0.85      0.92      0.88        12

    accuracy                           0.90        30
   macro avg       0.94      0.87      0.89        30
weighted avg       0.91      0.90      0.90        30
print("fPCA with 10 harmonics Feature Extraction")
_, fPCA_cm = SVM_fPCA(x,y,10,15,0.3)
_ = plt_cm(fPCA_cm)
fPCA with 10 harmonics Feature Extraction
SVM Classification Report:
              precision    recall  f1-score   support

      Greece       1.00      1.00      1.00         5
       Italy       0.90      1.00      0.95         9
    Portugal       1.00      0.75      0.86         4
       Spain       0.92      0.92      0.92        12

    accuracy                           0.93        30
   macro avg       0.95      0.92      0.93        30
weighted avg       0.94      0.93      0.93        30
Below is the entirety of the code from the utils_4.py file:
### Helper Functions for Q4
# Importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skfda
from scipy.interpolate import splrep
from skfda.preprocessing.dim_reduction.feature_extraction import FPCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import (classification_report,
confusion_matrix)
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
# %% Part 1
class_names = ['Greece', 'Italy', 'Portugal', 'Spain']
# Produces the 70-knot B-spline coefficients per data point
def bsplinec(row):
    x = np.array(row)
    knots = np.linspace(0, row.size, 68)[1:-1]
    tck = splrep(np.arange(row.size), x, k=3, t=knots)
    return tck[1][:-4]
# Custom Transformer for `bsplinec`
class BSplineC(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.apply_along_axis(bsplinec, 1, X)
# SVM Classification with Cubic B-spline Feature Extraction
def SVM_bspline(X, y, clf=SVC(kernel='rbf', C=16, gamma=0.2)):
    x_train, x_test, y_train, y_test = \
        train_test_split(X, y, test_size=30, shuffle=False)
    scaler_pre = StandardScaler()
    scaler_post = StandardScaler()
    min_max_pre = MinMaxScaler()
    min_max_post = MinMaxScaler()
    pipe = Pipeline(steps=[('pre_scale', scaler_pre),
                           ('pre_minmax', min_max_pre),
                           ('bspline', BSplineC()),
                           ('post_scale', scaler_post),
                           ('post_minmax', min_max_post),
                           ('clf', clf)])
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    print("SVM Classification Report:\n" +
          classification_report(y_test, y_pred,
                                target_names=class_names))
    return y_pred, confusion_matrix(y_test, y_pred)
# %% Part 2
# Custom Transformer to convert a dataframe to a skfda.FDataGrid
class DfToFdat(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = pd.DataFrame(X)
        return df_to_fdat(X_)

# Function to convert a dataframe to a skfda.FDataGrid
def df_to_fdat(df):
    grid_pnts = df.columns.tolist()
    data_matrix = df.values.tolist()
    return skfda.FDataGrid(data_matrix, grid_pnts)
# SVM Classification with fPCA Feature Extraction
def SVM_fPCA(x, y, n_components, c=4, gamma=0.62):
    x_train, x_test, y_train, y_test = \
        train_test_split(x, y, test_size=30, shuffle=False)
    scaler_pre = StandardScaler()
    scaler_post = StandardScaler()
    min_max_post = MinMaxScaler()
    pipe = Pipeline(steps=[
        ('scaler_pre', scaler_pre),
        ('df_to_fdat', DfToFdat()),
        ('fPCA', FPCA(n_components=n_components)),
        ('scaler_post', scaler_post),
        ('min_max_post', min_max_post),
        ('SVC', SVC(kernel='rbf', C=c, gamma=gamma))])
    pipe.fit(x_train, y_train)
    y_test_pred = pipe.predict(x_test)
    print("SVM Classification Report:\n" +
          classification_report(y_test, y_test_pred,
                                target_names=class_names))
    return y_test_pred, confusion_matrix(y_test, y_test_pred)
# %% CV Grid Search
# Cross-Validation for SVM Classification with fPCA Feature Extraction
def SVM_fPCA_CV(x, y, n_components, k=5, repeats=5, param_grid=None):
    rskf = RepeatedStratifiedKFold(n_splits=k,
                                   n_repeats=repeats,
                                   random_state=42)
    scaler_pre = StandardScaler()
    scaler_post = StandardScaler()
    min_max_post = MinMaxScaler()
    pipe = Pipeline(steps=[
        ('scaler_pre', scaler_pre),
        ('df_to_fdat', DfToFdat()),
        ('fPCA', FPCA(n_components=n_components)),
        ('scaler_post', scaler_post),
        ('min_max_post', min_max_post),
        ('SVC', SVC(kernel='rbf'))])
    if param_grid is None:
        param_grid = {
            'SVC__C': [1, 2, 4, 8, 16, 32],
            'SVC__gamma': [0.38, 0.44, 0.5, 0.56, 0.62, 0.68, 0.74]
        }
    search = GridSearchCV(pipe, param_grid, cv=rskf,
                          n_jobs=-1, verbose=1)
    search.fit(x, y)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    return search
# %% Confusion Matrix Plotting Function
def plt_cm(cmat):
    plt.rc('font', size=15)
    fig, ax = plt.subplots()
    im = ax.imshow(cmat, cmap='RdPu')
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cmat.shape[1]),
           yticks=np.arange(cmat.shape[0]))
    ax.set_xticklabels(class_names)
    ax.set_yticklabels(class_names, rotation=45)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    # Annotate each cell with its count, switching text color on dark cells
    for i in range(cmat.shape[0]):
        for j in range(cmat.shape[1]):
            _ = ax.text(j, i, cmat[i, j], ha="center", va="center",
                        color="w" if cmat[i, j] > cmat.max() / 2.1 else "k")
    plt.title('Confusion Matrix')
    fig.tight_layout()
    plt.show()
    return ax