{ "cells": [ { "cell_type": "markdown", "id": "liquid-rental", "metadata": {}, "source": [ "## Background\n", "\n", "Dataset taken from https://archive.ics.uci.edu/ml/datasets/Contraceptive+Method+Choice" ] }, { "cell_type": "markdown", "id": "proved-chemical", "metadata": {}, "source": [ "## Loading data" ] }, { "cell_type": "code", "execution_count": 1, "id": "daily-springfield", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wife_agewife_eduhusb_educhildrenwife_relwife_iswhusb_ocusol_indexmedia_expmethod_used
024233112301
1451310113401
243237113401
342329113301
436338113201
\n", "
" ], "text/plain": [ " wife_age wife_edu husb_edu children wife_rel wife_isw husb_ocu \\\n", "0 24 2 3 3 1 1 2 \n", "1 45 1 3 10 1 1 3 \n", "2 43 2 3 7 1 1 3 \n", "3 42 3 2 9 1 1 3 \n", "4 36 3 3 8 1 1 3 \n", "\n", " sol_index media_exp method_used \n", "0 3 0 1 \n", "1 4 0 1 \n", "2 4 0 1 \n", "3 3 0 1 \n", "4 2 0 1 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import pandas as pd\n",
 "\n",
 "# Names are passed explicitly, so read_csv treats the first line of\n",
 "# cmc.data as data rather than as a header row.\n",
 "column_names = [\n",
 "    'wife_age', 'wife_edu', 'husb_edu',\n",
 "    'children', 'wife_rel', 'wife_isw',\n",
 "    'husb_ocu', 'sol_index', 'media_exp',\n",
 "    'method_used',\n",
 "]\n",
 "\n",
 "df = pd.read_csv('cmc.data', names=column_names)\n",
 "df.head()"
 ] }, { "cell_type": "markdown", "id": "anonymous-portland", "metadata": {}, "source": [ "## Preparing Data" ] }, { "cell_type": "code", "execution_count": 2, "id": "warming-sacramento", "metadata": {}, "outputs": [], "source": [
 "# Every column except the target is used as a feature.\n",
 "target = 'method_used'\n",
 "features = [name for name in column_names if name != target]\n",
 "\n",
 "X = df[features]\n",
 "y = df[target]"
 ] }, { "cell_type": "markdown", "id": "found-feature", "metadata": {}, "source": [ "### Scaling" ] }, { "cell_type": "markdown", "id": "private-paradise", "metadata": {}, "source": [ "I'm using a StandardScaler" ] }, { "cell_type": "code", "execution_count": 3, "id": "eleven-hello", "metadata": {}, "outputs": [], "source": [
 "from sklearn.preprocessing import StandardScaler\n",
 "\n",
 "# Standardise the features; X is replaced by its scaled version.\n",
 "scaler = StandardScaler()\n",
 "X = scaler.fit(X).transform(X)"
 ] }, { "cell_type": "markdown", "id": "premium-cooperative", "metadata": {}, "source": [ "### Feature selection\n", "First I'd like to see how balanced the target classes are" ] }, { "cell_type": "code", "execution_count": 4, "id": "respiratory-heart", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
method_usedmedia_exp
01629
12333
23511
\n", "
" ], "text/plain": [ " method_used media_exp\n", "0 1 629\n", "1 2 333\n", "2 3 511" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "df.loc[:, ['media_exp', 'method_used']]\\\n",
 "    .groupby('method_used')\\\n",
 "    .count()\\\n",
 "    .reset_index()"
 ] }, { "cell_type": "code", "execution_count": 5, "id": "renewable-tracy", "metadata": {}, "outputs": [], "source": [ "# Install a pip package in the current Jupyter kernel\n", "#import sys\n", "#!{sys.executable} -m pip install -U imbalanced-learn" ] }, { "cell_type": "code", "execution_count": 6, "id": "every-yahoo", "metadata": {}, "outputs": [], "source": [
 "from imblearn.over_sampling import SMOTE\n",
 "\n",
 "# Fixed seed so the synthetic samples are the same on every run.\n",
 "# NOTE(review): oversampling happens before the train/test split, so\n",
 "# synthetic rows also land in the test set; consider splitting first.\n",
 "smote = SMOTE(sampling_strategy='all', random_state=0)\n",
 "X, y = smote.fit_resample(X, y)"
 ] }, { "cell_type": "markdown", "id": "intimate-hazard", "metadata": {}, "source": [ "Now I'm using the SelectKBest method to keep the `N` best features. `N` is currently 9, which is every feature, so nothing is dropped." ] }, { "cell_type": "code", "execution_count": 7, "id": "married-personality", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Top 9 features: ['wife_age' 'wife_edu' 'husb_edu' 'children' 'wife_rel' 'wife_isw'\n", " 'husb_ocu' 'sol_index' 'media_exp']\n" ] } ], "source": [
 "from sklearn.feature_selection import SelectKBest, f_classif\n",
 "\n",
 "N = 9\n",
 "fs = SelectKBest(f_classif, k=N)\n",
 "X_selected = fs.fit_transform(X, y)\n",
 "\n",
 "# Positions of the kept columns; they are valid for df because the\n",
 "# feature columns come first there, in the same order as in X.\n",
 "selected_idx = fs.get_support(indices=True)\n",
 "df_improved = df.iloc[:, selected_idx]\n",
 "feature_names = df_improved.columns.values\n",
 "\n",
 "# NOTE(review): the train/test split below is built from X, not from\n",
 "# X_selected, so the selection has no effect on the models yet.\n",
 "print(\"Top {} features: {}\".format(N, feature_names))"
 ] }, { "cell_type": "markdown", "id": "protected-vulnerability", "metadata": {}, "source": [ "### Create training/test sets" ] }, { "cell_type": "code", "execution_count": 8, "id": "played-engineering", "metadata": {}, "outputs": [], "source": [
 "from sklearn.model_selection import train_test_split\n",
 "\n",
 "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
 ] }, { 
"cell_type": "markdown", "id": "continuing-strategy", "metadata": {}, "source": [ "## Model creation" ] }, { "cell_type": "code", "execution_count": 9, "id": "roman-ecuador", "metadata": {}, "outputs": [], "source": [
 "from sklearn.svm import SVC\n",
 "from sklearn.linear_model import LogisticRegression\n",
 "from sklearn.neighbors import KNeighborsClassifier\n",
 "from sklearn.tree import DecisionTreeClassifier\n",
 "from sklearn.ensemble import RandomForestClassifier\n",
 "from sklearn.ensemble import AdaBoostClassifier\n",
 "from sklearn.svm import LinearSVC\n",
 "\n",
 "# Seed shared by every estimator that draws random numbers, so that a\n",
 "# Restart & Run All reproduces the same models.\n",
 "SEED = 0\n",
 "\n",
 "# Each entry pairs an estimator with the hyper-parameter grid searched for it.\n",
 "classifiers = [\n",
 "    {\n",
 "        'classifier': SVC(),\n",
 "        'params': {\n",
 "            'C': [1, 5, 10, 20, 30, 40]\n",
 "        }\n",
 "    },\n",
 "    {\n",
 "        'classifier': LogisticRegression(random_state=SEED),\n",
 "        'params': {\n",
 "            'C': [1, 5, 10, 20, 30, 40],\n",
 "            'solver': ['newton-cg', 'saga']\n",
 "        }\n",
 "    },\n",
 "    {\n",
 "        'classifier': KNeighborsClassifier(),\n",
 "        'params': {\n",
 "            'n_neighbors': [5, 10, 15, 20]\n",
 "        }\n",
 "    },\n",
 "    {\n",
 "        'classifier': DecisionTreeClassifier(random_state=SEED),\n",
 "        'params': {\n",
 "            'max_depth': [3, 4, 5, 6]\n",
 "        }\n",
 "    },\n",
 "    {\n",
 "        'classifier': RandomForestClassifier(random_state=SEED),\n",
 "        'params': {\n",
 "            'max_depth': [3, 4, 5, 6]\n",
 "        }\n",
 "    },\n",
 "    {\n",
 "        'classifier': AdaBoostClassifier(random_state=SEED),\n",
 "        'params': {\n",
 "            'n_estimators': [50, 60, 75, 100]\n",
 "        }\n",
 "    }\n",
 "]"
 ] }, { "cell_type": "code", "execution_count": 10, "id": "moral-consistency", "metadata": {}, "outputs": [], "source": [
 "from sklearn.model_selection import GridSearchCV\n",
 "\n",
 "def resolve_best_params(classifier_entry, X_param, y_param):\n",
 "    \"\"\"Grid-search one entry of `classifiers` and return its best parameters.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    classifier_entry : dict\n",
 "        Holds the estimator under 'classifier' and its grid under 'params'.\n",
 "    X_param, y_param\n",
 "        Features and labels the 5-fold cross-validated search is fitted on.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    dict\n",
 "        The parameter combination with the best 'precision_macro' score.\n",
 "    \"\"\"\n",
 "    grid_search = GridSearchCV(\n",
 "        classifier_entry['classifier'],\n",
 "        cv=5,\n",
 "        param_grid=classifier_entry['params'],\n",
 "        # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter\n",
 "        scoring='precision_macro')\n",
 "    grid_search.fit(X_param, y_param)\n",
 "\n",
 "    return grid_search.best_params_"
 ] }, { "cell_type": "code", "execution_count": 11, "id": "specified-electric", "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
 "from sklearn.metrics import confusion_matrix\n",
 "import seaborn as sns\n",
 "import matplotlib.pyplot as plt\n",
 "\n",
 "f, ax = plt.subplots(2, 3, figsize=(15, 10))\n",
 "\n",
 "# One panel per classifier, filled row by row.\n",
 "for clsf, panel in zip(classifiers, ax.flat):\n",
 "    # Tune and fit on the training split only, so the X_test / y_test rows\n",
 "    # have not been seen by the model when the matrix is computed.\n",
 "    best_params = resolve_best_params(clsf, X_train, y_train)\n",
 "    classifier = clsf['classifier'].set_params(**best_params).fit(X_train, y_train)\n",
 "    clsf_name = type(classifier).__name__\n",
 "\n",
 "    y_predicted = classifier.predict(X_test)\n",
 "    # Take the class labels from the fitted model instead of hard-coding them.\n",
 "    labels = classifier.classes_\n",
 "    matrix = confusion_matrix(y_test, y_predicted, labels=labels)\n",
 "    dataframe = pd.DataFrame(matrix, index=labels, columns=labels)\n",
 "\n",
 "    panel.set_title(clsf_name)\n",
 "    sns.heatmap(dataframe, cbar=False, square=True, annot=True, fmt='g', ax=panel)\n",
 "    # confusion_matrix puts the true classes on rows, predictions on columns.\n",
 "    panel.set(xlabel='Predicted method', ylabel='Actual method')\n",
 "\n",
 "plt.show()"
 ] }, { "cell_type": "code", "execution_count": 12, "id": "informational-beauty", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "SVC -- {'C': 40}\n", "-------------------------------------------------------\n", "\n", " precision recall f1-score support\n", "\n", " 1 0.92 0.76 0.83 161\n", " 2 0.72 0.88 0.79 161\n", " 3 0.76 0.72 0.74 150\n", "\n", " accuracy 0.79 472\n", " macro avg 0.80 0.79 0.79 472\n", "weighted avg 0.80 0.79 0.79 472\n", "\n", "\n", "\n", "LogisticRegression -- {'C': 1, 'solver': 'newton-cg'}\n", "-------------------------------------------------------\n", "\n", " precision recall f1-score support\n", "\n", " 1 0.58 0.55 0.57 161\n", " 2 0.56 0.61 0.59 161\n", " 3 0.45 0.43 0.44 150\n", "\n", " accuracy 0.53 472\n", " macro avg 0.53 0.53 0.53 472\n", "weighted avg 0.53 0.53 0.53 472\n", "\n", "\n", "\n", "KNeighborsClassifier -- {'n_neighbors': 5}\n", "-------------------------------------------------------\n", "\n", " precision recall f1-score support\n", "\n", " 1 0.73 0.65 0.68 161\n", " 2 
0.68 0.86 0.76 161\n", " 3 0.70 0.58 0.63 150\n", "\n", " accuracy 0.70 472\n", " macro avg 0.70 0.69 0.69 472\n", "weighted avg 0.70 0.70 0.69 472\n", "\n", "\n", "\n", "DecisionTreeClassifier -- {'max_depth': 6}\n", "-------------------------------------------------------\n", "\n", " precision recall f1-score support\n", "\n", " 1 0.65 0.75 0.70 161\n", " 2 0.71 0.68 0.69 161\n", " 3 0.63 0.55 0.59 150\n", "\n", " accuracy 0.66 472\n", " macro avg 0.66 0.66 0.66 472\n", "weighted avg 0.66 0.66 0.66 472\n", "\n", "\n", "\n", "RandomForestClassifier -- {'max_depth': 6}\n", "-------------------------------------------------------\n", "\n", " precision recall f1-score support\n", "\n", " 1 0.79 0.65 0.71 161\n", " 2 0.66 0.76 0.71 161\n", " 3 0.60 0.62 0.61 150\n", "\n", " accuracy 0.68 472\n", " macro avg 0.68 0.68 0.68 472\n", "weighted avg 0.69 0.68 0.68 472\n", "\n", "\n", "\n", "AdaBoostClassifier -- {'n_estimators': 75}\n", "-------------------------------------------------------\n", "\n", " precision recall f1-score support\n", "\n", " 1 0.72 0.60 0.66 161\n", " 2 0.63 0.70 0.66 161\n", " 3 0.55 0.58 0.56 150\n", "\n", " accuracy 0.63 472\n", " macro avg 0.63 0.63 0.63 472\n", "weighted avg 0.64 0.63 0.63 472\n", "\n", "\n" ] } ], "source": [
 "from sklearn.metrics import classification_report\n",
 "\n",
 "for clsf in classifiers:\n",
 "    # Tune and fit on the training split only; X_test / y_test are used\n",
 "    # solely for the report, so the scores reflect unseen rows.\n",
 "    best_params = resolve_best_params(clsf, X_train, y_train)\n",
 "    classifier = clsf['classifier'].set_params(**best_params).fit(X_train, y_train)\n",
 "    clsf_name = type(classifier).__name__\n",
 "    y_predicted = classifier.predict(X_test)\n",
 "    report_result = classification_report(y_test, y_predicted)\n",
 "\n",
 "    print()\n",
 "    print(\"{} -- {}\".format(clsf_name, best_params))\n",
 "    print('-------------------------------------------------------')\n",
 "    print()\n",
 "    print(report_result)\n",
 "    print()"
 ] }, { "cell_type": "code", "execution_count": null, "id": "classical-series", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": 
{ "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 5 }