Commit b9cfff3e authored by Eva Zangerle's avatar Eva Zangerle
Browse files

fixed order of watermark cell, added classification notebook

parent e71ab59e
......@@ -10,30 +10,6 @@
"Eva Zangerle"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7f813edb-5062-4654-8b92-b518cbae651e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:34:20\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -64,6 +40,30 @@
"data_dir = \"../data\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7f813edb-5062-4654-8b92-b518cbae651e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:34:20\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "markdown",
"id": "ecd30be6-6645-40c7-8458-d847038f00a0",
......
......@@ -10,30 +10,6 @@
"Eva Zangerle"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fbb83221-436f-4178-b4e6-e4b0293f3faf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:34:13\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "code",
"execution_count": 1,
......@@ -80,6 +56,30 @@
"data_dir = \"../data\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fbb83221-436f-4178-b4e6-e4b0293f3faf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:34:13\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "markdown",
"id": "747aa842-23a2-4659-abdc-3131a07894e2",
......
......@@ -11,30 +11,6 @@
"Eva Zangerle"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5c366d3c-4f97-48fa-b7fe-cda370045641",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:34:05\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "code",
"execution_count": 2,
......@@ -74,6 +50,30 @@
"data_dir = \"../data\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5c366d3c-4f97-48fa-b7fe-cda370045641",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:34:05\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "code",
"execution_count": 5,
......
......@@ -12,30 +12,6 @@
"Eva Zangerle"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a870c325-706d-4c07-bb57-e3b84322e6e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:33:54\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "code",
"execution_count": 1,
......@@ -69,40 +45,36 @@
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e40bcacf-a3ad-4d2d-8114-1979982d6f6f",
"execution_count": 2,
"id": "5406f6f3-1c06-4f3b-aaaf-9ac6f2967729",
"metadata": {},
"outputs": [],
"source": [
"data_dir = \"../data\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a870c325-706d-4c07-bb57-e3b84322e6e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:28:29\n",
"\n",
"Python implementation: CPython\n",
"Python version : 3.9.7\n",
"IPython version : 7.30.0\n",
"\n",
"Git hash: 32f20dee0a914472fdcdc1781c07a28a9f0d1828\n",
"Last updated: 2021-11-29 09:33:54\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t -g -v"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5406f6f3-1c06-4f3b-aaaf-9ac6f2967729",
"metadata": {},
"outputs": [],
"source": [
"data_dir = \"../data\""
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
......@@ -12,30 +12,6 @@
"Eva Zangerle"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "29a380d3-d574-47d1-8649-98bbd9c58edf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:33:47\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "code",
"execution_count": 1,
......@@ -63,6 +39,30 @@
"data_dir = \"../data\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "29a380d3-d574-47d1-8649-98bbd9c58edf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The watermark extension is already loaded. To reload it, use:\n",
" %reload_ext watermark\n",
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-29 09:33:47\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "markdown",
"id": "c4abdb51-195d-40d4-a46e-acacdd63e517",
......
{
"cells": [
{
"cell_type": "markdown",
"id": "3e125f13-6ae0-49a6-8916-bc7715d622e3",
"metadata": {
"tags": []
},
"source": [
"# Classification\n",
"Lecture Data Engineering and Analytics<br>\n",
"Eva Zangerle"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "3bf8eda6-1552-4bcc-bed9-3f42108b751a",
"metadata": {},
"outputs": [],
"source": [
"from pprint import pprint\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
"from matplotlib.colors import ListedColormap\n",
"from sklearn import datasets\n",
"from sklearn.metrics import (\n",
" ConfusionMatrixDisplay,\n",
" PrecisionRecallDisplay,\n",
" RocCurveDisplay,\n",
" accuracy_score,\n",
" classification_report,\n",
" confusion_matrix,\n",
")\n",
"from sklearn.model_selection import (\n",
" GridSearchCV,\n",
" KFold,\n",
" StratifiedKFold,\n",
" cross_val_predict,\n",
" cross_val_score,\n",
" cross_validate,\n",
" train_test_split,\n",
")\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d71d4893-1d48-40db-8f5f-40d706561497",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-11-30 16:54:31\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fa37ef66-9adc-43f4-bb87-7815ce6e485e",
"metadata": {},
"outputs": [],
"source": [
"# specify data directory\n",
"data_dir = \"../data\""
]
},
{
"cell_type": "markdown",
"id": "161004a8-3c68-460f-a391-dd16219d2685",
"metadata": {
"tags": []
},
"source": [
"## k Nearest Neighbors"
]
},
{
"cell_type": "markdown",
"id": "125ceb50-1129-4543-ae58-792ed0f91913",
"metadata": {},
"source": [
"The following initial examples are (again) based on the iris dataset (loaded directly via scikit-learn). The dataset consists of 50 samples from each of three species of the iris flower (150 samples in total) and describes the length and width of each flower's sepals (Kelchblatt) and petals (Blütenblatt). More information on the dataset can be found here: https://en.wikipedia.org/wiki/Iris_flower_data_set."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9e66deca-28dc-4c78-8f59-1e6f207d2c78",
"metadata": {},
"outputs": [],
"source": [
"# load iris dataset via sklearn\n",
"def load_iris():\n",
" iris = datasets.load_iris()\n",
" # define X and y as data and target\n",
" # for now, we stick to two features as input: sepal length and width\n",
" X = iris.data[:, :2]\n",
" y = iris.target\n",
" return X, y, iris.target_names"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a1f5f0d1-0507-4144-a0bd-8a8815ef374b",
"metadata": {},
"outputs": [],
"source": [
"X, y, target_names = load_iris()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e607a69a-da27-48cb-ab16-e4a29d091c98",
"metadata": {},
"outputs": [],
"source": [
"# preprocess\n",
"scaler = StandardScaler()\n",
"X = scaler.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "79c4742e-7048-4f59-b22b-574693bdd9cb",
"metadata": {},
"outputs": [],
"source": [
"# split data into training and test\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)"
]
},
{
"cell_type": "markdown",
"id": "ce50464d-d77b-43fb-8f7a-4c3769bf49bf",
"metadata": {},
"source": [
"The following `plot_decision_boundaries` function is a convenience function to visualize the decision boundaries for the discussed classification algorithms. Code adapted from the official sklearn documentation at https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ebc5f1d9-3f6b-49a1-b5e3-5c4f4f4a5a41",
"metadata": {},
"outputs": [],
"source": [
"def plot_decision_boundaries(X, y, classifier, xlabel, ylabel):\n",
" # Plot the decision boundary. For that, we will assign a color to each\n",
" # point in the mesh [x_min, x_max]x[y_min, y_max].\n",
"\n",
" h = 0.05 # step size in the mesh\n",
"\n",
" # Create color maps\n",
" cmap_light = ListedColormap([\"yellow\", \"red\", \"green\"])\n",
" colors_light = [\"yellow\", \"red\", \"green\"]\n",
"\n",
" x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
" y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
"\n",
" # define mesh\n",
" xx, yy = np.meshgrid(\n",
" np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)\n",
" )\n",
"\n",
" # classify each point in mesh\n",
" Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])\n",
"\n",
" # Put the result into a color plot\n",
" Z = Z.reshape(xx.shape)\n",
" plt.figure(figsize=(8, 6))\n",
" plt.contourf(xx, yy, Z, cmap=cmap_light, alpha=0.15)\n",
"\n",
" # plot training points, once for each class\n",
" for label in set(y):\n",
" sns.scatterplot(\n",
" x=X[y == label, 0],\n",
" y=X[y == label, 1],\n",
" color=colors_light[label], # iris.target_names[y],\n",
" alpha=1.0,\n",
" edgecolor=\"black\",\n",
" )\n",
" plt.xlim(xx.min(), xx.max())\n",
" plt.ylim(yy.min(), yy.max())\n",
" plt.title(\"Contour Diagram\")\n",
" plt.xlabel(xlabel)\n",
" plt.ylabel(ylabel)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c791737c-fce8-4bd2-bd57-00ef9d84d1c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsClassifier(algorithm='kd_tree', n_neighbors=7, p=1)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute kNN with k=7\n",
"# minkowski = generalization of Manhattan and Euclidean distance\n",
"# order p=1 -> Manhattan, p=2 -> Euclidean\n",
"knn = KNeighborsClassifier(\n",
" n_neighbors=7, algorithm=\"kd_tree\", leaf_size=30, metric=\"minkowski\", p=1\n",
")\n",
"knn.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "024c1726-a79b-4f0e-873b-67fda101bbff",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 576x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# inspect decision boundaries for different k\n",
"plot_decision_boundaries(\n",
" X_train, y_train, knn, \"sepal length (cm)\", \"sepal width (cm)\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "27a57141-8e4b-4418-b7be-6a73489d50b6",
"metadata": {
"tags": []
},
"source": [
"# Evaluation"
]
},
{
"cell_type": "markdown",
"id": "521e975b-cfc5-4873-bb25-f6cb5e7812cd",
"metadata": {},
"source": [
"After a first glimpse at a rather simple, lazy classification approach, we will discuss the systematic evaluation of classification approaches."
]
},
{
"cell_type": "markdown",
"id": "dc21295d-c155-490b-b2b2-98f088049f17",
"metadata": {},
"source": [
"As a first step, we will build a `sklearn.pipeline.Pipeline` to define the full workflow for computing classifications."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "dad8b8ae-ed3d-46c1-b10b-7ef270a8b3f8",
"metadata": {},
"outputs": [],
"source": [
"# define pipeline (with names)\n",