{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageyearaux_nodesstatus
0306411
1306231
2306501
3315921
4316541
\n", "
" ], "text/plain": [ " age year aux_nodes status\n", "0 30 64 1 1\n", "1 30 62 3 1\n", "2 30 65 0 1\n", "3 31 59 2 1\n", "4 31 65 4 1" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"haberman.data\", names=['age', 'year', 'aux_nodes', 'status'])\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5974025974025974" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.svm import SVC\n", "\n", "features = [col for col in df.columns.values if col != 'status']\n", "\n", "X = df[features]\n", "y = df['status']\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)\n", "\n", "svc = SVC().fit(X_train, y_train)\n", "svc.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "0.5974025974025974" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.dummy import DummyClassifier\n", "\n", "dummy = DummyClassifier(strategy = \"most_frequent\").fit(X_train, y_train)\n", "dummy.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statuscountpct
012250.735294
12810.264706
\n", "
" ], "text/plain": [ " status count pct\n", "0 1 225 0.735294\n", "1 2 81 0.264706" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def show_balance(dataframe):\n", " rates_df = dataframe[['status', 'age']]\\\n", " .copy()\\\n", " .groupby('status')\\\n", " .count()\\\n", " .rename(columns={'age': 'count'})\\\n", " .reset_index(drop=False)\n", " rates_df['pct'] = rates_df['count'] / rates_df['count'].sum()\n", " \n", " return rates_df\n", " \n", "show_balance(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 1 = the patient survived 5 years or longer\n", "- 2 = the patient died within 5 year" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n", "\u001b[33mWARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n", "\u001b[33mWARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n", "Requirement already up-to-date: imbalanced-learn in /opt/conda/lib/python3.8/site-packages (0.7.0)\n", "Requirement already satisfied, skipping upgrade: joblib>=0.11 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (0.16.0)\n", "Requirement already satisfied, skipping upgrade: scipy>=0.19.1 in /opt/conda/lib/python3.8/site-packages (from 
imbalanced-learn) (1.5.2)\n", "Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (1.19.1)\n", "Requirement already satisfied, skipping upgrade: scikit-learn>=0.23 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (0.23.2)\n", "Requirement already satisfied, skipping upgrade: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from scikit-learn>=0.23->imbalanced-learn) (2.1.0)\n" ] } ], "source": [ "# Install a pip package in the current Jupyter kernel\n", "# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/\n", "# The version is pinned to the one this notebook was run with (see the log below)\n", "# so that a re-run does not silently pull a newer, possibly incompatible release.\n", "import sys\n", "!{sys.executable} -m pip install imbalanced-learn==0.7.0" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from imblearn.over_sampling import SMOTE\n", "\n", "# Seed the sampler: SMOTE synthesises minority rows at random, so an unseeded\n", "# run produces different data (and different downstream scores) every time.\n", "smote = SMOTE(random_state=0)\n", "X_resampled, y_resampled = smote.fit_resample(X, y)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statuscountpct
012250.5
122250.5
\n", "
" ], "text/plain": [ " status count pct\n", "0 1 225 0.5\n", "1 2 225 0.5" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df = X_resampled.copy()\n", "new_df['status'] = y_resampled.copy()\n", "\n", "show_balance(new_df)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.48672566371681414" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.dummy import DummyClassifier\n", "\n", "dummy = DummyClassifier(strategy = \"most_frequent\").fit(X_train, y_train)\n", "dummy.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5663716814159292" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.svm import SVC\n", "\n", "svc = SVC().fit(X_train, y_train)\n", "svc.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model tuning" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6363636363636364" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.svm import SVC\n", "\n", "# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html\n", "svc = SVC(class_weight='balanced').fit(X_train, y_train)\n", "svc.score(X_test, y_test)" ] }, { "cell_type": 
"code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6493506493506493" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "\n", "# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n", "logistic = LogisticRegression(class_weight='balanced').fit(X_train, y_train)\n", "logistic.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6493506493506493" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "\n", "# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n", "weights = {1: 0.27, 2: 0.73}\n", "logistic = LogisticRegression(class_weight=weights).fit(X_train, y_train)\n", "logistic.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }