# Usage

Import the library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tefs import TEFS
%config InlineBackend.figure_format = 'retina'

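Generate some random data: `m` independent standard-normal features and a target `y` equal to their sum plus standard-normal noise.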

In [None]:
# Fix the global NumPy seed so the synthetic dataset is reproducible.
np.random.seed(0)
n = 1000  # number of samples (rows)
m = 15  # number of features (columns x1 .. xm)

# Draw the feature columns one at a time, in the order x1, x2, ..., xm.
synthetic = {f'x{idx}': np.random.normal(size=n) for idx in range(1, m + 1)}

# The target is the element-wise sum of every feature plus Gaussian noise.
# The noise is drawn only after all the features have been generated.
feature_total = sum(synthetic.values())
synthetic['y'] = feature_total + np.random.normal(size=n)

data = pd.DataFrame(synthetic)

Split the data into training and test sets.

In [None]:
# Hold out the last 20% of the rows as the test set. The split is a plain
# positional cut (no shuffling), so the row order is preserved.
n_test = int(0.20 * n)
n_train = n - n_test
data, data_test = data.iloc[:n_train], data.iloc[n_train:]

# Preview the first rows of the training portion.
data.head()

Define features and target

In [None]:
# Column to predict; every other column is a candidate feature.
target_name = "y"
target = data[target_name]
features = data.drop(columns=[target_name])
features_names = features.columns.tolist()

## Configuration

Set the hyperparameters

In [None]:
# Lags handed to TEFS for the candidate features and for the target.
lag_features = [0, 1]
lag_target = [1]

# Search direction: 'forward' or 'backward'.
direction = 'forward'

# Rule of thumb: set k to one tenth of the number of training samples.
k = n_train // 10

## Run the algorithm

Perform feature selection

In [None]:
# Configure the selector on the raw NumPy arrays of the training data,
# using the hyperparameters defined in the configuration cell.
fs = TEFS(
    features=features.to_numpy(),
    target=target.to_numpy(),
    var_names=features_names,
    k=k,
    lag_features=lag_features,
    lag_target=lag_target,
    direction=direction,
    n_jobs=4,
    verbose=2,
)

# Run the selection; the results are queried from `fs` afterwards.
fs.fit()

## Inspect the results

In [None]:
# Draw the TEFS results on a dedicated single-axes figure.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
fs.plot_te_results(ax=ax)
plt.show()

Select features based on a threshold

In [None]:
# Select with an infinite threshold. NOTE(review): presumably the most
# conservative setting (the wrapper plots label this point
# "TEFS (conservative)") -- confirm against the TEFS documentation.
selected_features = fs.select_features(threshold=float("inf"))

# Remember how many features this rule kept; the wrapper plots mark it.
n_features_selected_with_threshold = len(selected_features)

# Bare expression: shown as the cell output in the notebook.
selected_features

Select a specific number of features

In [None]:
# Ask the selector for a selection of n=4 features.
selected_features = fs.select_n_features(n=4)
# Bare expression: shown as the cell output in the notebook.
selected_features

## Wrapper analysis

In [None]:
from tefs.metrics import regression_analysis
from sklearn.model_selection import TimeSeriesSplit

num_total_features = features.shape[1]

# Cross-validation uses the full dataset: training rows followed by the
# test rows, re-indexed from zero.
unified_df = pd.concat([data, data_test], axis=0).reset_index(drop=True)
n_samples = unified_df.shape[0]
n_splits = 5
# Fixed-size rolling window
cv_scheme = TimeSeriesSplit(
    n_splits=n_splits,
    max_train_size=n_samples // (n_splits + 1),
)

# One entry per candidate subset size, from 0 up to all the features.
scores = []
scores_cv = []
for n_features in range(num_total_features + 1):
    selected_features = fs.select_n_features(n=n_features)

    # Every selected feature gets the feature lags; the target is also
    # included as an input, with its own lags.
    inputs_names_lags = dict.fromkeys(selected_features, lag_features)
    inputs_names_lags[target_name] = lag_target

    # Train-test scores
    train_test_score = regression_analysis(
        inputs_names_lags=inputs_names_lags,
        target_name=target_name,
        df_train=data,
        df_test=data_test,
    )
    scores.append(train_test_score)

    # Cross-validation scores
    cross_val_score = regression_analysis(
        inputs_names_lags=inputs_names_lags,
        target_name=target_name,
        df=unified_df,
        cv_scheme=cv_scheme,
    )
    scores_cv.append(cross_val_score)

# Convert to arrays so the plotting cells can index and aggregate them.
scores = np.array(scores)
scores_cv = np.array(scores_cv)

### With train and test split

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

# Score obtained for each number of selected features.
ax.plot(scores, marker="o", label="Fixed train-test")

# Marker style shared by the two highlighted points.
highlight = dict(marker="o", linestyle="None", markersize=10)

# Red: every position that attains the best score.
maxima = (scores == scores.max()).nonzero()[0]
ax.plot(maxima, scores[maxima], color="red", label="Maximum", **highlight)

# Green: the subset size picked by the threshold-based selection.
ax.plot(
    n_features_selected_with_threshold,
    scores[n_features_selected_with_threshold],
    color="green",
    label="TEFS (conservative)",
    **highlight,
)

ax.legend()

ax.set_title("Wrapper TEFS")
ax.set_xlabel("Number of features")
ax.set_ylabel("Test R2")

# Put a labelled tick on every second subset size.
step = 2
ticks = range(0, num_total_features + 1, step)
ax.set_xticks(ticks)
ax.set_xticklabels(ticks)

ax.set_ylim(-0.1, 1)
ax.grid(visible=True)

plt.show()

### With cross-validation

In [None]:
import scipy.stats

fig, ax = plt.subplots(figsize=(10, 5))

# Average the fold scores for each number of selected features.
cv_mean = scores_cv.mean(axis=1)
ax.plot(cv_mean, marker="o", label="Cross-validation")

# Marker style shared by the two highlighted points.
highlight = dict(marker="o", linestyle="None", markersize=10)

# Red: every position that attains the best mean score.
maxima = (cv_mean == cv_mean.max()).nonzero()[0]
ax.plot(maxima, cv_mean[maxima], color="red", label="Maximum", **highlight)

# Green: the subset size picked by the threshold-based selection.
ax.plot(
    n_features_selected_with_threshold,
    cv_mean[n_features_selected_with_threshold],
    color="green",
    label="TEFS (conservative)",
    **highlight,
)

ax.legend()

# Confidence band around the mean (alpha = 0.1, i.e. 90% confidence), built
# from the normal quantile and the standard deviation across the folds
# divided by the square root of the number of folds.
alpha = 0.1
quantile = scipy.stats.norm.ppf(1 - alpha / 2)
half_width = scores_cv.std(axis=1) * quantile / np.sqrt(scores_cv.shape[1])
ax.fill_between(
    range(scores_cv.shape[0]),
    cv_mean - half_width,
    cv_mean + half_width,
    alpha=0.3,
)

ax.set_title("Wrapper TEFS")
ax.set_xlabel("Number of features")
ax.set_ylabel("Test R2")

# Put a labelled tick on every second subset size.
step = 2
ticks = range(0, num_total_features + 1, step)
ax.set_xticks(ticks)
ax.set_xticklabels(ticks)

ax.set_ylim(-0.1, 1)
ax.grid(visible=True)

plt.show()