{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>year</th>\n",
       "      <th>aux_nodes</th>\n",
       "      <th>status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>30</td>\n",
       "      <td>64</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30</td>\n",
       "      <td>62</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>30</td>\n",
       "      <td>65</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>31</td>\n",
       "      <td>59</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>31</td>\n",
       "      <td>65</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age  year  aux_nodes  status\n",
       "0   30    64          1       1\n",
       "1   30    62          3       1\n",
       "2   30    65          0       1\n",
       "3   31    59          2       1\n",
       "4   31    65          4       1"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"haberman.data\", names=['age', 'year', 'aux_nodes', 'status'])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5974025974025974"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "features = [col for col in df.columns.values if col != 'status']\n",
    "\n",
    "X = df[features]\n",
    "y = df['status']\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)\n",
    "\n",
    "svc = SVC().fit(X_train, y_train)\n",
    "svc.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5974025974025974"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.dummy import DummyClassifier\n",
    "\n",
    "dummy = DummyClassifier(strategy = \"most_frequent\").fit(X_train, y_train)\n",
    "dummy.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>status</th>\n",
       "      <th>count</th>\n",
       "      <th>pct</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>225</td>\n",
       "      <td>0.735294</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>81</td>\n",
       "      <td>0.264706</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   status  count       pct\n",
       "0       1    225  0.735294\n",
       "1       2     81  0.264706"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def show_balance(dataframe):\n",
    "    rates_df = dataframe[['status', 'age']]\\\n",
    "        .copy()\\\n",
    "        .groupby('status')\\\n",
    "        .count()\\\n",
    "        .rename(columns={'age': 'count'})\\\n",
    "        .reset_index(drop=False)\n",
    "    rates_df['pct'] = rates_df['count'] / rates_df['count'].sum()\n",
    "    \n",
    "    return rates_df\n",
    "        \n",
    "show_balance(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 1 = the patient survived 5 years or longer\n",
    "- 2 = the patient died within 5 year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mWARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7fb79ec2d850>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n",
      "\u001b[33mWARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7fb79ec76640>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n",
      "\u001b[33mWARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7fb79ec76850>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n",
      "Requirement already up-to-date: imbalanced-learn in /opt/conda/lib/python3.8/site-packages (0.7.0)\n",
      "Requirement already satisfied, skipping upgrade: joblib>=0.11 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (0.16.0)\n",
      "Requirement already satisfied, skipping upgrade: scipy>=0.19.1 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (1.5.2)\n",
      "Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (1.19.1)\n",
      "Requirement already satisfied, skipping upgrade: scikit-learn>=0.23 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (0.23.2)\n",
      "Requirement already satisfied, skipping upgrade: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from scikit-learn>=0.23->imbalanced-learn) (2.1.0)\n"
     ]
    }
   ],
   "source": [
    "# Install a pip package in the current Jupyter kernel\n",
    "# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/\n",
    "import sys\n",
    "!{sys.executable} -m pip install -U imbalanced-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from imblearn.over_sampling import SMOTE\n",
    "    \n",
    "smote = SMOTE()\n",
    "X_resampled, y_resampled = smote.fit_resample(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>status</th>\n",
       "      <th>count</th>\n",
       "      <th>pct</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>225</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>225</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   status  count  pct\n",
       "0       1    225  0.5\n",
       "1       2    225  0.5"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_df = X_resampled.copy()\n",
    "new_df['status'] = y_resampled.copy()\n",
    "\n",
    "show_balance(new_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.48672566371681414"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.dummy import DummyClassifier\n",
    "\n",
    "dummy = DummyClassifier(strategy = \"most_frequent\").fit(X_train, y_train)\n",
    "dummy.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5663716814159292"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.svm import SVC\n",
    "\n",
    "svc = SVC().fit(X_train, y_train)\n",
    "svc.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6363636363636364"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.svm import SVC\n",
    "\n",
    "# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html\n",
    "svc = SVC(class_weight='balanced').fit(X_train, y_train)\n",
    "svc.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6493506493506493"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
    "logistic = LogisticRegression(class_weight='balanced').fit(X_train, y_train)\n",
    "logistic.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6493506493506493"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
    "weights  = {1: 0.27, 2: 0.73}\n",
    "logistic = LogisticRegression(class_weight=weights).fit(X_train, y_train)\n",
    "logistic.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}