{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Usage\n",
  "\n",
  "A complete walk-through of feature selection with `TEFS` on synthetic data."
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "Import the required libraries"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "import pandas as pd\n",
  "import numpy as np\n",
  "import matplotlib.pyplot as plt\n",
  "from tefs import TEFS\n",
  "%config InlineBackend.figure_format = 'retina'"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "Generate some random data"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "np.random.seed(0)\n",
  "n = 1000  # number of samples\n",
  "m = 15  # number of features\n",
  "\n",
  "# Draw m independent standard-normal features, named x1 ... xm.\n",
  "columns = {f'x{i}': np.random.normal(size=n) for i in range(1, m + 1)}\n",
  "\n",
  "# The target is the sum of all features plus standard-normal noise.\n",
  "columns['y'] = sum(columns.values()) + np.random.normal(size=n)\n",
  "\n",
  "# Keep the complete dataset under its own name so that the split cell\n",
  "# can be re-run without consuming its own input.\n",
  "data_full = pd.DataFrame(columns)"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "Split the data into train and test sets"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "n_test = int(0.20 * n)\n",
  "n_train = n - n_test\n",
  "\n",
  "# Ordered (non-shuffled) split: the first n_train rows are the training set,\n",
  "# the remaining n_test rows are the test set. Slicing `data_full` instead of\n",
  "# re-binding `data` to a slice of itself keeps this cell idempotent.\n",
  "data = data_full.iloc[:n_train]\n",
  "data_test = data_full.iloc[n_train:]\n",
  "\n",
  "data.head()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "Define features and target"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "target_name = \"y\"\n",
  "# Everything except the target column is a candidate feature.\n",
  "features = data.drop(columns=[target_name])\n",
  "target = data[target_name]\n",
  "features_names = list(features.columns)"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Configuration\n",
  "\n",
  "Set the hyperparameters"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "k = n_train // 10  # Set k using the rule of thumb\n",
  "direction = 'forward'  # or 'backward'\n",
  "lag_features = [0, 1]\n",
  "lag_target = [1]"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Run the algorithm\n",
  "\n",
  "Perform feature 
selection" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "fs = TEFS(\n",
  "    features=features.values,\n",
  "    target=target.values,\n",
  "    k=k,\n",
  "    lag_features=lag_features,\n",
  "    lag_target=lag_target,\n",
  "    direction=direction,\n",
  "    verbose=2,\n",
  "    var_names=features_names,\n",
  "    n_jobs=4,\n",
  ")\n",
  "\n",
  "fs.fit()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Inspect the results"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "fig, ax = plt.subplots()\n",
  "fs.plot_te_results(ax=ax)\n",
  "plt.show()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "Select features based on a threshold"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "selected_features = fs.select_features(threshold=np.inf)\n",
  "# Remembered so the plots below can mark this selection on the score curves.\n",
  "n_features_selected_with_threshold = len(selected_features)\n",
  "selected_features"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "Select a specific number of features"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "selected_features = fs.select_n_features(n=4)\n",
  "selected_features"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Wrapper analysis"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "from tefs.metrics import regression_analysis\n",
  "from sklearn.model_selection import TimeSeriesSplit\n",
  "\n",
  "num_total_features = features.shape[1]\n",
  "scores = []\n",
  "scores_cv = []\n",
  "# Cross-validation runs on the train and test rows stitched back together.\n",
  "unified_df = pd.concat([data, data_test], axis=0).reset_index(drop=True)\n",
  "n_samples = unified_df.shape[0]\n",
  "n_splits = 5\n",
  "# Fixed-size rolling window\n",
  "cv_scheme = TimeSeriesSplit(\n",
  "    n_splits=n_splits,\n",
  "    max_train_size=n_samples // (n_splits + 1),\n",
  ")\n",
  "\n",
  "# Score a model for every possible number of selected features (0 included,\n",
  "# in which case only the lagged target is used as input).\n",
  "for n_features in range(0, num_total_features + 1):\n",
  "\n",
  "    selected_features = fs.select_n_features(n=n_features)\n",
  "    inputs_names_lags = {feature: lag_features for feature in selected_features}\n",
  "    inputs_names_lags[target_name] = lag_target\n",
  "\n",
  "    # Train-test scores\n",
  "    scores.append(regression_analysis(\n",
  "        inputs_names_lags=inputs_names_lags,\n",
  "        target_name=target_name,\n",
  "        df_train=data,\n",
  "        df_test=data_test,\n",
  "    ))\n",
  "\n",
  "    # Cross-validation scores\n",
  "    scores_cv.append(regression_analysis(\n",
  "        inputs_names_lags=inputs_names_lags,\n",
  "        target_name=target_name,\n",
  "        df=unified_df,\n",
  "        cv_scheme=cv_scheme,\n",
  "    ))\n",
  "\n",
  "scores = np.array(scores)\n",
  "scores_cv = np.array(scores_cv)"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "### With train and test split"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "fig, ax = plt.subplots(figsize=(10, 5))\n",
  "\n",
  "ax.plot(scores, marker=\"o\", label=\"Fixed train-test\")\n",
  "# Highlight every position that attains the best score (ties included).\n",
  "maxima = np.where(scores == scores.max())[0]\n",
  "ax.plot(maxima, scores[maxima], marker=\"o\", color=\"red\", linestyle=\"None\", label=\"Maximum\", markersize=10)\n",
  "ax.plot(n_features_selected_with_threshold, scores[n_features_selected_with_threshold], marker=\"o\", color=\"green\", linestyle=\"None\", label=\"TEFS (conservative)\", markersize=10)\n",
  "\n",
  "ax.legend()\n",
  "\n",
  "ax.set_xlabel(\"Number of features\")\n",
  "ax.set_ylabel(\"Test R2\")\n",
  "ax.set_title(\"Wrapper TEFS\")\n",
  "step = 2\n",
  "ax.set_xticks(range(0, num_total_features+1, step))\n",
  "ax.set_xticklabels(range(0, num_total_features+1, step))\n",
  "ax.set_ylim(-0.1, 1)\n",
  "ax.grid(visible=True)\n",
  "\n",
  "plt.show()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "### With cross-validation"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "import scipy.stats\n",
  "\n",
  "# Rows of scores_cv index the number of features, columns index the CV folds.\n",
  "cv_mean = scores_cv.mean(axis=1)\n",
  "n_folds = scores_cv.shape[1]\n",
  "\n",
  "fig, ax = plt.subplots(figsize=(10, 5))\n",
  "\n",
  "ax.plot(cv_mean, marker=\"o\", label=\"Cross-validation\")\n",
  "# Highlight every position that attains the best mean score (ties included).\n",
  "maxima = np.where(cv_mean == cv_mean.max())[0]\n",
  "ax.plot(maxima, cv_mean[maxima], marker=\"o\", color=\"red\", linestyle=\"None\", label=\"Maximum\", markersize=10)\n",
  "ax.plot(n_features_selected_with_threshold, cv_mean[n_features_selected_with_threshold], marker=\"o\", color=\"green\", linestyle=\"None\", label=\"TEFS (conservative)\", markersize=10)\n",
  "\n",
  "# Confidence band for the mean cross-validation score (90% confidence).\n",
  "# With only a handful of folds the standard error must come from the sample\n",
  "# standard deviation (ddof=1, the default of scipy.stats.sem) and the quantile\n",
  "# from a Student-t distribution; a normal quantile with the population\n",
  "# standard deviation would make the band too narrow.\n",
  "alpha = 0.1\n",
  "quantile = scipy.stats.t.ppf(1 - alpha / 2, df=n_folds - 1)\n",
  "half_width = quantile * scipy.stats.sem(scores_cv, axis=1)\n",
  "ax.fill_between(\n",
  "    range(scores_cv.shape[0]),\n",
  "    cv_mean - half_width,\n",
  "    cv_mean + half_width,\n",
  "    alpha=0.3,\n",
  "    label=f\"{100 * (1 - alpha):.0f}% confidence interval\",\n",
  ")\n",
  "\n",
  "# Build the legend only after the band is drawn so that it gets an entry too.\n",
  "ax.legend()\n",
  "\n",
  "ax.set_xlabel(\"Number of features\")\n",
  "ax.set_ylabel(\"Test R2\")\n",
  "ax.set_title(\"Wrapper TEFS\")\n",
  "step = 2\n",
  "ax.set_xticks(range(0, num_total_features+1, step))\n",
  "ax.set_xticklabels(range(0, num_total_features+1, step))\n",
  "ax.set_ylim(-0.1, 1)\n",
  "ax.grid(visible=True)\n",
  "\n",
  "plt.show()"
 ] }
],
"metadata": {
 "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
 "language_info": {
  "codemirror_mode": { "name": "ipython", "version": 3 },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.10.6"
 }
},
"nbformat": 4,
"nbformat_minor": 2 }