{
"cells": [
{
"cell_type": "markdown",
"id": "thermal-syndrome",
"metadata": {},
"source": [
"## DATA PREPARATION"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "basic-ethnic",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Año | \n",
" Mes | \n",
" Hora Solicitud | \n",
" Hora Intervención | \n",
" Código | \n",
" Distrito | \n",
" Hospital | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:03:21 | \n",
" 0:08:49 | \n",
" Accidente de monopatin | \n",
" ARGANZUELA | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:04:03 | \n",
" 0:14:44 | \n",
" Patología obstétrica | \n",
" SAN BLAS | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:04:05 | \n",
" 0:10:12 | \n",
" Inconsciente sin filiar causa | \n",
" VILLA DE VALLECAS | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:05:09 | \n",
" 0:13:15 | \n",
" Incendio | \n",
" VALLECAS PTE. | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:07:43 | \n",
" 0:16:19 | \n",
" Agresión sin especificar | \n",
" VALLECAS PTE. | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Año Mes Hora Solicitud Hora Intervención \\\n",
"0 2020 ENERO 0:03:21 0:08:49 \n",
"1 2020 ENERO 0:04:03 0:14:44 \n",
"2 2020 ENERO 0:04:05 0:10:12 \n",
"3 2020 ENERO 0:05:09 0:13:15 \n",
"4 2020 ENERO 0:07:43 0:16:19 \n",
"\n",
" Código Distrito Hospital \n",
"0 Accidente de monopatin ARGANZUELA NaN \n",
"1 Patología obstétrica SAN BLAS NaN \n",
"2 Inconsciente sin filiar causa VILLA DE VALLECAS NaN \n",
"3 Incendio VALLECAS PTE. NaN \n",
"4 Agresión sin especificar VALLECAS PTE. NaN "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Read the semicolon-delimited CSV into a DataFrame and preview its first rows.\n",
"df = pd.read_csv('samur_2020.csv', sep=';')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "humanitarian-austin",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" YEAR | \n",
" MONTH | \n",
" HOUR_REQUEST | \n",
" HOUR_INTERVENTION | \n",
" CODE | \n",
" DISTRICT | \n",
" HOSPITAL | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:03:21 | \n",
" 0:08:49 | \n",
" Accidente de monopatin | \n",
" ARGANZUELA | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:04:03 | \n",
" 0:14:44 | \n",
" Patología obstétrica | \n",
" SAN BLAS | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:04:05 | \n",
" 0:10:12 | \n",
" Inconsciente sin filiar causa | \n",
" VILLA DE VALLECAS | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:05:09 | \n",
" 0:13:15 | \n",
" Incendio | \n",
" VALLECAS PTE. | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 2020 | \n",
" ENERO | \n",
" 0:07:43 | \n",
" 0:16:19 | \n",
" Agresión sin especificar | \n",
" VALLECAS PTE. | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" YEAR MONTH HOUR_REQUEST HOUR_INTERVENTION CODE \\\n",
"0 2020 ENERO 0:03:21 0:08:49 Accidente de monopatin \n",
"1 2020 ENERO 0:04:03 0:14:44 Patología obstétrica \n",
"2 2020 ENERO 0:04:05 0:10:12 Inconsciente sin filiar causa \n",
"3 2020 ENERO 0:05:09 0:13:15 Incendio \n",
"4 2020 ENERO 0:07:43 0:16:19 Agresión sin especificar \n",
"\n",
" DISTRICT HOSPITAL \n",
"0 ARGANZUELA NaN \n",
"1 SAN BLAS NaN \n",
"2 VILLA DE VALLECAS NaN \n",
"3 VALLECAS PTE. NaN \n",
"4 VALLECAS PTE. NaN "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace the original headers with English identifiers (assigned by position).\n",
"df.columns = [\n",
"    'YEAR',\n",
"    'MONTH',\n",
"    'HOUR_REQUEST',\n",
"    'HOUR_INTERVENTION',\n",
"    'CODE',\n",
"    'DISTRICT',\n",
"    'HOSPITAL',\n",
"]\n",
"\n",
"# A response time needs both timestamps, so rows missing either one are dropped.\n",
"df = df.dropna(subset=['HOUR_REQUEST', 'HOUR_INTERVENTION'], how='any')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "desperate-bacteria",
"metadata": {},
"outputs": [],
"source": [
"def time_difference(from_date, to_date):\n",
"    \"\"\"Return the whole minutes elapsed from ``from_date`` to ``to_date``.\n",
"\n",
"    Both arguments are ``'%H:%M:%S'`` strings. Only the ``seconds`` component of\n",
"    the resulting ``timedelta`` is used, so an end time earlier than the start\n",
"    time is measured as if it fell on the following day.\n",
"    \"\"\"\n",
"    from datetime import datetime\n",
"\n",
"    clock_format = '%H:%M:%S'\n",
"    start = datetime.strptime(from_date, clock_format)\n",
"    end = datetime.strptime(to_date, clock_format)\n",
"    elapsed_seconds = (end - start).seconds\n",
"    return elapsed_seconds // 60\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "convenient-fraud",
"metadata": {},
"outputs": [],
"source": [
"def fix_midnight(date):\n",
"    \"\"\"Zero-pad a time whose text starts with a bare ``0:`` hour.\n",
"\n",
"    ``'0:03:21'`` becomes ``'00:03:21'``. Any value whose string form does not\n",
"    start with ``'0:'`` (other hours, already padded times, NaN) is returned\n",
"    unchanged.\n",
"    \"\"\"\n",
"    text = str(date)\n",
"    if text.startswith('0:'):\n",
"        # Work on the string form that was just tested, so non-str inputs do not\n",
"        # raise and values without exactly three ':'-separated parts still work.\n",
"        return '0' + text\n",
"    return date"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "solid-candidate",
"metadata": {},
"outputs": [],
"source": [
"# Encode each categorical column as integer codes.\n",
"# pd.factorize marks missing values with -1, which is why HOSPITAL shows -1 for NaN.\n",
"month_codes, _ = pd.factorize(df['MONTH'])\n",
"code_codes, _ = pd.factorize(df['CODE'])\n",
"district_codes, _ = pd.factorize(df['DISTRICT'])\n",
"hospital_codes, _ = pd.factorize(df['HOSPITAL'])\n",
"\n",
"df['MONTH'] = month_codes\n",
"df['CODE'] = code_codes\n",
"df['DISTRICT'] = district_codes\n",
"df['HOSPITAL'] = hospital_codes\n",
"\n",
"# Zero-pad midnight hours before the timestamps are parsed.\n",
"df['HOUR_REQUEST'] = df['HOUR_REQUEST'].apply(fix_midnight)\n",
"df['HOUR_INTERVENTION'] = df['HOUR_INTERVENTION'].apply(fix_midnight)\n",
"\n",
"# Pair the two columns directly instead of transposing the whole frame and\n",
"# indexing each resulting column positionally (x[0], x[1]); positional lookup on\n",
"# a label-indexed Series is deprecated and the transpose is needlessly expensive.\n",
"df['MINUTES_TO_RESPOND'] = [\n",
"    time_difference(requested, attended)\n",
"    for requested, attended in zip(df['HOUR_REQUEST'], df['HOUR_INTERVENTION'])\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "latest-archives",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9.0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"# Mean of MINUTES_TO_RESPOND, rounded down to a whole number of minutes.\n",
"average_time_to_respond = np.floor(np.average(df['MINUTES_TO_RESPOND']))\n",
"average_time_to_respond"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ultimate-federal",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" YEAR | \n",
" MONTH | \n",
" HOUR_REQUEST | \n",
" HOUR_INTERVENTION | \n",
" CODE | \n",
" DISTRICT | \n",
" HOSPITAL | \n",
" MINUTES_TO_RESPOND | \n",
" HOUR | \n",
" ABOVE_AVG | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2020 | \n",
" 0 | \n",
" 00:03:21 | \n",
" 00:08:49 | \n",
" 0 | \n",
" 0 | \n",
" -1 | \n",
" 5 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2020 | \n",
" 0 | \n",
" 00:04:03 | \n",
" 00:14:44 | \n",
" 1 | \n",
" 1 | \n",
" -1 | \n",
" 10 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 2020 | \n",
" 0 | \n",
" 00:04:05 | \n",
" 00:10:12 | \n",
" 2 | \n",
" 2 | \n",
" -1 | \n",
" 6 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2020 | \n",
" 0 | \n",
" 00:05:09 | \n",
" 00:13:15 | \n",
" 3 | \n",
" 3 | \n",
" -1 | \n",
" 8 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 2020 | \n",
" 0 | \n",
" 00:07:43 | \n",
" 00:16:19 | \n",
" 4 | \n",
" 3 | \n",
" -1 | \n",
" 8 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" YEAR MONTH HOUR_REQUEST HOUR_INTERVENTION CODE DISTRICT HOSPITAL \\\n",
"0 2020 0 00:03:21 00:08:49 0 0 -1 \n",
"1 2020 0 00:04:03 00:14:44 1 1 -1 \n",
"2 2020 0 00:04:05 00:10:12 2 2 -1 \n",
"3 2020 0 00:05:09 00:13:15 3 3 -1 \n",
"4 2020 0 00:07:43 00:16:19 4 3 -1 \n",
"\n",
" MINUTES_TO_RESPOND HOUR ABOVE_AVG \n",
"0 5 0 0 \n",
"1 10 0 1 \n",
"2 6 0 0 \n",
"3 8 0 0 \n",
"4 8 0 0 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hour of the request: the text before the first ':' converted to an integer.\n",
"df['HOUR'] = df['HOUR_REQUEST'].str.split(':').str[0].astype(int)\n",
"\n",
"# Binary label: 1 when the response took strictly longer than the average, else 0.\n",
"df['ABOVE_AVG'] = (df['MINUTES_TO_RESPOND'] > average_time_to_respond).astype(int)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "starting-shark",
"metadata": {},
"source": [
"## Feature selection"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "adjusted-grounds",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" YEAR | \n",
" MONTH | \n",
" CODE | \n",
" DISTRICT | \n",
" HOSPITAL | \n",
" HOUR | \n",
" ABOVE_AVG | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2020 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" -1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2020 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" -1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 2020 | \n",
" 0 | \n",
" 2 | \n",
" 2 | \n",
" -1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2020 | \n",
" 0 | \n",
" 3 | \n",
" 3 | \n",
" -1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 2020 | \n",
" 0 | \n",
" 4 | \n",
" 3 | \n",
" -1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" YEAR MONTH CODE DISTRICT HOSPITAL HOUR ABOVE_AVG\n",
"0 2020 0 0 0 -1 0 0\n",
"1 2020 0 1 1 -1 0 1\n",
"2 2020 0 2 2 -1 0 0\n",
"3 2020 0 3 3 -1 0 0\n",
"4 2020 0 4 3 -1 0 0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Columns kept for the analysis: the encoded/derived columns plus the ABOVE_AVG label.\n",
"# The raw time strings and MINUTES_TO_RESPOND are left out.\n",
"cols = ['YEAR', 'MONTH', 'CODE', 'DISTRICT', 'HOSPITAL', 'HOUR', 'ABOVE_AVG']\n",
"samur = df[cols]\n",
"samur.head()"
]
},
{
"cell_type": "markdown",
"id": "proper-carbon",
"metadata": {},
"source": [
"### Correlation Matrix"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "differential-belle",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.8/site-packages/numpy/lib/function_base.py:2642: RuntimeWarning: invalid value encountered in true_divide\n",
" c /= stddev[:, None]\n",
"/opt/conda/lib/python3.8/site-packages/numpy/lib/function_base.py:2643: RuntimeWarning: invalid value encountered in true_divide\n",
" c /= stddev[None, :]\n"
]
},
{
"data": {
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# DataFrame.corr() computes the same Pearson coefficients as np.corrcoef(samur.T)\n",
"# but yields NaN for a zero-variance column (YEAR is constant in this data)\n",
"# without emitting NumPy's invalid-value RuntimeWarning.\n",
"corr_matrix = samur.corr()\n",
"\n",
"fig, ax = plt.subplots(figsize=(10, 10))\n",
"sns.heatmap(\n",
"    corr_matrix,\n",
"    cbar=False,\n",
"    annot=True,\n",
"    square=True,\n",
"    xticklabels=samur.columns,\n",
"    yticklabels=samur.columns,\n",
"    ax=ax)\n",
"# The trailing semicolon suppresses the text repr of the returned object.\n",
"ax.set_title('Pearson correlation between features and ABOVE_AVG');"
]
},
{
"cell_type": "markdown",
"id": "signed-pharmaceutical",
"metadata": {},
"source": [
"### Imbalance analysis"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "psychological-morocco",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ABOVE_AVG | \n",
" COUNT | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 1 | \n",
" 28268 | \n",
"
\n",
" \n",
" 0 | \n",
" 0 | \n",
" 75583 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ABOVE_AVG COUNT\n",
"1 1 28268\n",
"0 0 75583"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows per ABOVE_AVG value, listed with the positive class first.\n",
"(\n",
"    df.groupby(\"ABOVE_AVG\")[\"MONTH\"]\n",
"    .count()\n",
"    .rename(\"COUNT\")\n",
"    .reset_index()\n",
"    .sort_values(\"ABOVE_AVG\", ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bridal-oxford",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Feature columns and the binary label column.\n",
"columns = ['YEAR', 'MONTH', 'CODE', 'DISTRICT', 'HOSPITAL', 'HOUR']\n",
"target = 'ABOVE_AVG'\n",
"\n",
"# X holds the feature columns, y the label, both taken from samur.\n",
"X = samur[columns]\n",
"y = samur[target]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "mediterranean-louis",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: imbalanced-learn in /opt/conda/lib/python3.8/site-packages (0.8.0)\n",
"Requirement already satisfied: numpy>=1.13.3 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (1.20.0)\n",
"Requirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (1.0.0)\n",
"Requirement already satisfied: scikit-learn>=0.24 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (0.24.1)\n",
"Requirement already satisfied: scipy>=0.19.1 in /opt/conda/lib/python3.8/site-packages (from imbalanced-learn) (1.6.0)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from scikit-learn>=0.24->imbalanced-learn) (2.1.0)\n"
]
}
],
"source": [
"# Install imbalanced-learn into the interpreter that runs this kernel.\n",
"# The version is pinned (0.8.0 is the release recorded in this cell's output) so\n",
"# that re-running the notebook cannot silently upgrade the package.\n",
"import sys\n",
"!{sys.executable} -m pip install imbalanced-learn==0.8.0"
]
},
{
"cell_type": "markdown",
"id": "engaging-moral",
"metadata": {},
"source": [
"Because there are already many samples, I think it would be counter-productive to add more (over-sampling). I'm fine with simply having the same number of positive samples as negative ones, so I'm using an under-sampling algorithm from imblearn (imbalanced-learn)."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "bacterial-deployment",
"metadata": {},
"outputs": [],
"source": [
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"# Randomly under-sample with a fixed seed. X and y are overwritten with the\n",
"# resampled data, so re-running this cell resamples the already-reduced set.\n",
"# NOTE(review): resampling happens before the train/test split, so the test set is\n",
"# drawn from the resampled data as well; confirm this is intended.\n",
"rus = RandomUnderSampler(random_state=0)\n",
"X, y = rus.fit_resample(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "needed-munich",
"metadata": {},
"outputs": [],
"source": [
"# Hold out a quarter of the rows for evaluation, using a fixed seed.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.25,\n",
"    random_state=0,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "realistic-maryland",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BEST 5 FEATURES: ['MONTH' 'CODE' 'DISTRICT' 'HOSPITAL' 'HOUR']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.8/site-packages/sklearn/feature_selection/_univariate_selection.py:114: UserWarning: Features [0] are constant.\n",
" warnings.warn(\"Features %s are constant.\" % constant_features_idx,\n",
"/opt/conda/lib/python3.8/site-packages/sklearn/feature_selection/_univariate_selection.py:116: RuntimeWarning: invalid value encountered in true_divide\n",
" f = msb / msw\n"
]
}
],
"source": [
"from sklearn.feature_selection import SelectKBest\n",
"from sklearn.feature_selection import f_classif\n",
"\n",
"# Score every feature against the label with f_classif and keep the 5 best.\n",
"fs = SelectKBest(f_classif, k=5)\n",
"X_selected = fs.fit_transform(X, y)\n",
"\n",
"# get_support() positions refer to the columns of X, so the names are resolved\n",
"# through X rather than through samur, which also carries the ABOVE_AVG label\n",
"# column and only lines up with X by coincidence of column order.\n",
"# NOTE(review): assumes fit_resample returned X as a DataFrame; confirm.\n",
"cols = fs.get_support(indices=True)\n",
"best_features = X.columns[cols]\n",
"df_best = samur[best_features]\n",
"\n",
"print(\"BEST 5 FEATURES: {}\".format(best_features.values))"
]
},
{
"cell_type": "markdown",
"id": "worse-prisoner",
"metadata": {},
"source": [
"## CONFUSION MATRICES"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "respiratory-rochester",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.8/site-packages/sklearn/svm/_base.py:985: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
" warnings.warn(\"Liblinear failed to converge, increase \"\n"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
"from sklearn.dummy import DummyClassifier\n",
"\n",
"# Fit a set of classifiers on the training split. Every estimator that draws\n",
"# random numbers gets a fixed seed so a re-run reproduces the same models.\n",
"logistic = LogisticRegression().fit(X_train, y_train)\n",
"# dual=False solves the primal problem, which scikit-learn recommends when\n",
"# n_samples > n_features. The default dual solver failed to converge on this\n",
"# data and predicted a single class for every test row.\n",
"svc = LinearSVC(random_state=0, dual=False).fit(X_train, y_train)\n",
"knn = KNeighborsClassifier(5).fit(X_train, y_train)\n",
"tree = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)\n",
"rff = RandomForestClassifier(\n",
"    max_depth=2, n_estimators=4, max_features=2, random_state=0\n",
").fit(X_train, y_train)\n",
"ada = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)\n",
"# Baseline that always predicts the most frequent training label.\n",
"dummy = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)\n",
"classifiers = [dummy, logistic, svc, knn, tree, rff, ada]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "greater-formula",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DummyClassifier\n",
"===========================================\n",
"[[7053 0]\n",
" [7081 0]]\n",
"\n",
"LogisticRegression\n",
"===========================================\n",
"[[4944 2109]\n",
" [3685 3396]]\n",
"\n",
"LinearSVC\n",
"===========================================\n",
"[[ 0 7053]\n",
" [ 0 7081]]\n",
"\n",
"KNeighborsClassifier\n",
"===========================================\n",
"[[4273 2780]\n",
" [2732 4349]]\n",
"\n",
"DecisionTreeClassifier\n",
"===========================================\n",
"[[5825 1228]\n",
" [4486 2595]]\n",
"\n",
"RandomForestClassifier\n",
"===========================================\n",
"[[4631 2422]\n",
" [3381 3700]]\n",
"\n",
"AdaBoostClassifier\n",
"===========================================\n",
"[[4934 2119]\n",
" [3054 4027]]\n",
"\n"
]
}
],
"source": [
"from sklearn.metrics import confusion_matrix\n",
"\n",
"separator = '==========================================='\n",
"\n",
"# One confusion matrix per fitted classifier, computed on the test split.\n",
"for model in classifiers:\n",
"    predictions = model.predict(X_test)\n",
"    print(type(model).__name__)\n",
"    print(separator)\n",
"    print(confusion_matrix(y_test, predictions))\n",
"    print()"
]
},
{
"cell_type": "markdown",
"id": "junior-powell",
"metadata": {},
"source": [
"## CLASSIFICATION REPORTS"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "original-raleigh",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DummyClassifier\n",
"===========================================\n",
" precision recall f1-score support\n",
"\n",
" above_avg 0.50 1.00 0.67 7053\n",
"not_above_avg 0.00 0.00 0.00 7081\n",
"\n",
" accuracy 0.50 14134\n",
" macro avg 0.25 0.50 0.33 14134\n",
" weighted avg 0.25 0.50 0.33 14134\n",
"\n",
"LogisticRegression\n",
"===========================================\n",
" precision recall f1-score support\n",
"\n",
" above_avg 0.57 0.70 0.63 7053\n",
"not_above_avg 0.62 0.48 0.54 7081\n",
"\n",
" accuracy 0.59 14134\n",
" macro avg 0.59 0.59 0.59 14134\n",
" weighted avg 0.59 0.59 0.58 14134\n",
"\n",
"LinearSVC\n",
"===========================================\n",
" precision recall f1-score support\n",
"\n",
" above_avg 0.00 0.00 0.00 7053\n",
"not_above_avg 0.50 1.00 0.67 7081\n",
"\n",
" accuracy 0.50 14134\n",
" macro avg 0.25 0.50 0.33 14134\n",
" weighted avg 0.25 0.50 0.33 14134\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n",
"/opt/conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n",
"/opt/conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n",
"/opt/conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n",
"/opt/conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n",
"/opt/conda/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"KNeighborsClassifier\n",
"===========================================\n",
" precision recall f1-score support\n",
"\n",
" above_avg 0.61 0.61 0.61 7053\n",
"not_above_avg 0.61 0.61 0.61 7081\n",
"\n",
" accuracy 0.61 14134\n",
" macro avg 0.61 0.61 0.61 14134\n",
" weighted avg 0.61 0.61 0.61 14134\n",
"\n",
"DecisionTreeClassifier\n",
"===========================================\n",
" precision recall f1-score support\n",
"\n",
" above_avg 0.56 0.83 0.67 7053\n",
"not_above_avg 0.68 0.37 0.48 7081\n",
"\n",
" accuracy 0.60 14134\n",
" macro avg 0.62 0.60 0.57 14134\n",
" weighted avg 0.62 0.60 0.57 14134\n",
"\n",
"RandomForestClassifier\n",
"===========================================\n",
" precision recall f1-score support\n",
"\n",
" above_avg 0.58 0.66 0.61 7053\n",
"not_above_avg 0.60 0.52 0.56 7081\n",
"\n",
" accuracy 0.59 14134\n",
" macro avg 0.59 0.59 0.59 14134\n",
" weighted avg 0.59 0.59 0.59 14134\n",
"\n",
"AdaBoostClassifier\n",
"===========================================\n",
" precision recall f1-score support\n",
"\n",
" above_avg 0.62 0.70 0.66 7053\n",
"not_above_avg 0.66 0.57 0.61 7081\n",
"\n",
" accuracy 0.63 14134\n",
" macro avg 0.64 0.63 0.63 14134\n",
" weighted avg 0.64 0.63 0.63 14134\n",
"\n"
]
}
],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",
"# classification_report pairs target_names with the sorted labels, i.e. 0 then 1.\n",
"# ABOVE_AVG == 0 means the response was NOT above average, so that name goes first.\n",
"class_names = ['not_above_avg', 'above_avg']\n",
"\n",
"# Precision / recall / F1 for each fitted classifier, computed on the test split.\n",
"for clsf in classifiers:\n",
"    y_predicted = clsf.predict(X_test)\n",
"    print(clsf.__class__.__name__)\n",
"    print('===========================================')\n",
"    # zero_division=0 keeps the reported 0.0 for a class that is never predicted\n",
"    # while silencing the repeated UndefinedMetricWarning.\n",
"    print(classification_report(\n",
"        y_test,\n",
"        y_predicted,\n",
"        target_names=class_names,\n",
"        zero_division=0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "restricted-nashville",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}