In [1]:
import pandas as pd

df = pd.read_csv("haberman.data", names=['age', 'year', 'aux_nodes', 'status'])
df.head()

Unnamed: 0,age,year,aux_nodes,status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

features = [col for col in df.columns.values if col != 'status']

X = df[features]
y = df['status']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

svc = SVC().fit(X_train, y_train)
svc.score(X_test, y_test)

0.5974025974025974

In [3]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy = "most_frequent").fit(X_train, y_train)
dummy.score(X_test, y_test)

0.5974025974025974

In [4]:
def show_balance(dataframe):
    rates_df = dataframe[['status', 'age']]\
        .copy()\
        .groupby('status')\
        .count()\
        .rename(columns={'age': 'count'})\
        .reset_index(drop=False)
    rates_df['pct'] = rates_df['count'] / rates_df['count'].sum()
    
    return rates_df
        
show_balance(df)

Unnamed: 0,status,count,pct
0,1,225,0.735294
1,2,81,0.264706


- 1 = the patient survived 5 years or longer
- 2 = the patient died within 5 year

In [5]:
# Install a pip package in the current Jupyter kernel
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
import sys
!{sys.executable} -m pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /opt/conda/lib/python3.8/site-packages (0.7.0)


In [6]:
from imblearn.over_sampling import SMOTE
    
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [7]:
new_df = X_resampled.copy()
new_df['status'] = y_resampled.copy()

show_balance(new_df)

Unnamed: 0,status,count,pct
0,1,225,0.5
1,2,225,0.5


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0)

In [9]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy = "most_frequent").fit(X_train, y_train)
dummy.score(X_test, y_test)

0.48672566371681414

In [10]:
from sklearn.svm import SVC

svc = SVC().fit(X_train, y_train)
svc.score(X_test, y_test)

0.5663716814159292

## Model tuning

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
from sklearn.svm import SVC

# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
svc = SVC(class_weight='balanced').fit(X_train, y_train)
svc.score(X_test, y_test)

0.6363636363636364

In [13]:
from sklearn.linear_model import LogisticRegression

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
logistic = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
logistic.score(X_test, y_test)

0.6493506493506493

In [23]:
from sklearn.linear_model import LogisticRegression

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
weights  = {1: 0.27, 2: 0.73}
logistic = LogisticRegression(class_weight=weights).fit(X_train, y_train)
logistic.score(X_test, y_test)

0.6493506493506493