{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" year | \n",
" aux_nodes | \n",
" status | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 30 | \n",
" 64 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 30 | \n",
" 62 | \n",
" 3 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 30 | \n",
" 65 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 31 | \n",
" 59 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 31 | \n",
" 65 | \n",
" 4 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age year aux_nodes status\n",
"0 30 64 1 1\n",
"1 30 62 3 1\n",
"2 30 65 0 1\n",
"3 31 59 2 1\n",
"4 31 65 4 1"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"haberman.data\", names=['age', 'year', 'aux_nodes', 'status'])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5974025974025974"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.svm import SVC\n",
"\n",
"features = [col for col in df.columns.values if col != 'status']\n",
"\n",
"X = df[features]\n",
"y = df['status']\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)\n",
"\n",
"svc = SVC().fit(X_train, y_train)\n",
"svc.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0.5974025974025974"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.dummy import DummyClassifier\n",
"\n",
"dummy = DummyClassifier(strategy = \"most_frequent\").fit(X_train, y_train)\n",
"dummy.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" status | \n",
" count | \n",
" pct | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 225 | \n",
" 0.735294 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 81 | \n",
" 0.264706 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" status count pct\n",
"0 1 225 0.735294\n",
"1 2 81 0.264706"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def show_balance(dataframe):\n",
" rates_df = dataframe[['status', 'age']]\\\n",
" .copy()\\\n",
" .groupby('status')\\\n",
" .count()\\\n",
" .rename(columns={'age': 'count'})\\\n",
" .reset_index(drop=False)\n",
" rates_df['pct'] = rates_df['count'] / rates_df['count'].sum()\n",
" \n",
" return rates_df\n",
" \n",
"show_balance(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- 1 = the patient survived 5 years or longer\n",
"- 2 = the patient died within 5 year"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n",
"\u001b[33mWARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n",
"\u001b[33mWARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/imbalanced-learn/\u001b[0m\n",
"Requirement already up-to-date: imbalanced-learn in /opt/conda/lib/python3.8/site-packages (0.7.0)\n",
"Requirement already satisfied, skipping upgrade: joblib>=0.11 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (0.16.0)\n",
"Requirement already satisfied, skipping upgrade: scipy>=0.19.1 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (1.5.2)\n",
"Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (1.19.1)\n",
"Requirement already satisfied, skipping upgrade: scikit-learn>=0.23 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (0.23.2)\n",
"Requirement already satisfied, skipping upgrade: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from scikit-learn>=0.23->imbalanced-learn) (2.1.0)\n"
]
}
],
"source": [
"# Install a pip package in the current Jupyter kernel\n",
"# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/\n",
"import sys\n",
"!{sys.executable} -m pip install -U imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from imblearn.over_sampling import SMOTE\n",
" \n",
"smote = SMOTE()\n",
"X_resampled, y_resampled = smote.fit_resample(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" status | \n",
" count | \n",
" pct | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 225 | \n",
" 0.5 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 225 | \n",
" 0.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" status count pct\n",
"0 1 225 0.5\n",
"1 2 225 0.5"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df = X_resampled.copy()\n",
"new_df['status'] = y_resampled.copy()\n",
"\n",
"show_balance(new_df)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.48672566371681414"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.dummy import DummyClassifier\n",
"\n",
"dummy = DummyClassifier(strategy = \"most_frequent\").fit(X_train, y_train)\n",
"dummy.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5663716814159292"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.svm import SVC\n",
"\n",
"svc = SVC().fit(X_train, y_train)\n",
"svc.score(X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model tuning"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6363636363636364"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.svm import SVC\n",
"\n",
"# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html\n",
"svc = SVC(class_weight='balanced').fit(X_train, y_train)\n",
"svc.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6493506493506493"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
"logistic = LogisticRegression(class_weight='balanced').fit(X_train, y_train)\n",
"logistic.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6493506493506493"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
"weights = {1: 0.27, 2: 0.73}\n",
"logistic = LogisticRegression(class_weight=weights).fit(X_train, y_train)\n",
"logistic.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}