{ "cells": [ { "cell_type": "markdown", "id": "edd718da-1295-49c4-b556-3cc7b718f93c", "metadata": {}, "source": [ "# Hypothesis Testing\n", "Lecture Data Engineering and Analytics
\n", "Eva Zangerle" ] }, { "cell_type": "code", "execution_count": 1, "id": "5b126eda-5b79-4531-b8ea-72898d09dc6d", "metadata": {}, "outputs": [], "source": [ "# import required packages\n", "import os\n", "import statistics\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from scipy import stats\n", "from sklearn.utils import resample" ] }, { "cell_type": "code", "execution_count": 2, "id": "5406f6f3-1c06-4f3b-aaaf-9ac6f2967729", "metadata": {}, "outputs": [], "source": [ "data_dir = \"../data\"" ] }, { "cell_type": "markdown", "id": "c4abdb51-195d-40d4-a46e-acacdd63e517", "metadata": {}, "source": [ "## Central Limit Theorem" ] }, { "cell_type": "markdown", "id": "33556303-a6c5-4637-aa75-c319ccc1aaba", "metadata": {}, "source": [ "In the following, we will investigate the central limit theorem. Code is adopted from (PracticalStatistics) https://github.com/gedeck/practical-statistics-for-data-scientists/. " ] }, { "cell_type": "code", "execution_count": 3, "id": "656050b5-45a5-4813-a228-e4b9ba03a111", "metadata": {}, "outputs": [], "source": [ "loans_income = pd.read_csv(\n", " os.path.join(data_dir, \"loans_income.csv\"), squeeze=True\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "id": "c0ef827e-8e5e-412f-a6ee-fe57f183d47e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 67000\n", "1 52000\n", "2 100000\n", "3 78762\n", "4 37041\n", " ... \n", "49995 40000\n", "49996 54000\n", "49997 50000\n", "49998 82000\n", "49999 70000\n", "Name: x, Length: 50000, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loans_income" ] }, { "cell_type": "code", "execution_count": 5, "id": "668ab92d-218b-45bd-b3dc-ca4dbad4013e", "metadata": {}, "outputs": [], "source": [ "# sample 1000 rows\n", "sample_data = pd.DataFrame(\n", " {\n", " \"income\": loans_income.sample(1000),\n", " \"type\": \"Data\",\n", " }\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "id": "f65ee2f1-c5bb-4146-9645-711fa3a2dce6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " income type\n", "19678 48300.0 Data\n", "5695 40000.0 Data\n", "13497 60000.0 Data\n", "6298 30000.0 Data\n", "1687 75000.0 Data\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
incometype
1967848300.00Data
569540000.00Data
1349760000.00Data
629830000.00Data
168775000.00Data
.........
499578028.85Mean of 20, 5000 samples
499682030.60Mean of 20, 5000 samples
499775391.50Mean of 20, 5000 samples
499856003.35Mean of 20, 5000 samples
499965848.20Mean of 20, 5000 samples
\n", "

9000 rows × 2 columns

\n", "
" ], "text/plain": [ " income type\n", "19678 48300.00 Data\n", "5695 40000.00 Data\n", "13497 60000.00 Data\n", "6298 30000.00 Data\n", "1687 75000.00 Data\n", "... ... ...\n", "4995 78028.85 Mean of 20, 5000 samples\n", "4996 82030.60 Mean of 20, 5000 samples\n", "4997 75391.50 Mean of 20, 5000 samples\n", "4998 56003.35 Mean of 20, 5000 samples\n", "4999 65848.20 Mean of 20, 5000 samples\n", "\n", "[9000 rows x 2 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# draw random samples from sample data\n", "sample_mean_05 = pd.DataFrame(\n", " {\n", " \"income\": [loans_income.sample(5).mean() for _ in range(1000)],\n", " \"type\": \"Mean of 5\",\n", " }\n", ")\n", "\n", "sample_mean_20 = pd.DataFrame(\n", " {\n", " \"income\": [loans_income.sample(20).mean() for _ in range(1000)],\n", " \"type\": \"Mean of 20\",\n", " }\n", ")\n", "\n", "\n", "sample_mean_20_2 = pd.DataFrame(\n", " {\n", " \"income\": [loans_income.sample(20).mean() for _ in range(5000)],\n", " \"type\": \"Mean of 20, 5000 samples\",\n", " }\n", ")\n", "\n", "sample_mean_50 = pd.DataFrame(\n", " {\n", " \"income\": [loans_income.sample(50).mean() for _ in range(1000)],\n", " \"type\": \"Mean of 50\",\n", " }\n", ")\n", "\n", "results = pd.concat(\n", " [\n", " sample_data,\n", " sample_mean_05,\n", " sample_mean_20,\n", " sample_mean_50,\n", " sample_mean_20_2,\n", " ]\n", ")\n", "print(results.head())\n", "results" ] }, { "cell_type": "markdown", "id": "15624c1f-fa95-4f15-80a6-8ae83ebb56f8", "metadata": {}, "source": [ "When inspecting the plots of the statistic distributions, we observe that the larger our samples are, the more similar the statistic distribution (distribution of means of samples) gets compared to a normal distribution." ] }, { "cell_type": "code", "execution_count": 7, "id": "52383e17-e209-4b42-85f0-b364b25ee70c", "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAARkAAALICAYAAABLmoQAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAAsTAAALEwEAmpwYAAA1mUlEQVR4nO3de7RV9Znm++8jXjBqBCJhMEQajaRNWuPlbNGoqYPJkCBJGpMyXo5HaI8Jp5PoaDsdh2RY1VqxRg/SZXcsctEQQwlVRqNGjxhtCCKadHcUtoaAprHYMdBuCrmIRRKtJIW+54/5W2Gy3Ze5916/tfZaPJ8x1lhzvnPONV82m4d5Weu3FBGYmeVyULMbMLP25pAxs6wcMmaWlUPGzLJyyJhZVg4ZM8vKIWN1JelNSeskvSDp55L+g6R+f88kTZH0fzWqR2ssh4zV2z9FxGkR8a+AC4ALgZsG2GYK4JBpU/Kb8ayeJP02Io4szZ8ArAWOAf4F8LfAEWnxNRHxPyU9DbwP+BWwBHiot/Ua9EewOnPIWF31DJlU+0fgXwK/Ad6KiN9JmgrcExEdkqYDX4qIj6f139Hbeo38c1j9HNzsBuyAcgjwDUmnAW8C7x3metYCHDKWVTpdehPYQXFtZjtwKsX1wN/1sdm/r7ietQBf+LVsJI0H7gC+EcV5+dHAtoh4C7gSGJVW/Q1wVGnTvtazFuRrMlZXkt4ENlCc8uyluID7XyPirXR95QdAAMuBL0TEkZIOAVYA7wLuAn7Y23qN/rNYfThkzCwrny6ZWVYOGTPLyiFjZlk5ZMwsq7YMmZkzZwbFnQk//PBj+I9hacuQ2bVrV7NbMLOkLUPGzEYOh4yZZeWQMbOsHDJmlpVDxsyycsiYWVYOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrBwyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOsHDJmllW2kJG0WNIOSc+XauMkrZS0KT2PTXVJWiipS9J6SWeUtpmb1t8kaW6ufs0sj5xHMncBM3vU5gOrImIqsCrNA1wITE2PecDtUIQScBNwFjANuKkWTGbWGrKFTET8GNjdozwbWJKmlwAXlepLo/A0MEbSROCjwMqI2B0RrwEreXtwmdkI1uhrMhMiYluafgWYkKaPBV4urdedan3V30bSPEmdkjp37txZ367NbMiaduE3IgKIOr7eoojoiIiO8ePH1+tlzWyYGh0y29NpEOl5R6pvBY4rrTcp1fqqm1mLaHTILANqd4jmAg+X6nPSXaazgT3ptGoFMEPS2HTBd0aqmVmLODjXC0u6B5gOHCOpm+Iu0QLgPklXA1uAS9LqjwGzgC7gDeAqgIjYLekWYG1a7ysR0fNispmNYCoujbSXjo6O6OzsbHYbZu1Cw9nY7/g1s6wcMmaWVaWQkXRulZqZWU9Vj2S+XrFmZraffu8uSfogcA4wXtIXS4veCYzK2ZiZtYeBbmEfChyZ1juqVP81cHGupsysffQbMhHxFPCUpLsiYkuDejKzNlL1zXiHSVoETClvExEfztGUmbWPqiFzP3AHcCfwZr52zKzdVA2ZvRFxe9ZOzKwtVb2F/Yikz0uamIbQHJdGrTMz61fVI5naJ6evL9UCOKG+7ZhZu6kUMhFxfO5GzKw9VQoZSXN6q0fE0vq2Y2btpurp0pml6dHAR4DnAIeMmfWr6unSteV5SWOAe3M0ZGbtZahDPbwO+DqNmQ2o6jWZR9j3zQKjgPcB9+VqyszaR9VrMreWpvcCWyKiO0M/ZtZmKp0upQ9KbqT4JPZY4A85mzKz9lF1ZLxLgDXApym+YeAZSR7qwcwGVPV06UbgzIjYASBpPPA48ECuxsysPVS9u3RQLWCSVwexrZkdwKoeySyXtAK4J81fSvGFbGZm/RpojN8TgQkRcb2kTwHnpUU/Be7O3ZyZtb6BjmRuA74MEBEPAg8CSDolLftExt7MrA0MdF1lQkRs6FlMtSlZOjKztjJQyIzpZ9nhdezDzNrUQCHTKemzPYuSPgM8O9SdStosaYOkdZI6U22cpJWSNqXnsakuSQsldUlaL+mMoe7XzBpvoGsy1wEPSbqCfaHSQfF9TJ8c5r7Pj4hdpfn5wKqIWCBpfpq/AbgQmJoeZwG3p2czawEDfe/SduAcSecDJ6fyoxHxRIZeZgPT0/QS4EmKkJkNLI2IAJ6WNEbSxIjYlqEHM6uzquPJrAZW13G/AfxIUgDfjohFFBeZa8HxCjAhTR8LvFzatjvV9gsZSfOAeQCTJ0+uY6tmNhxV34xXb+dFxFZJ7wZWStpYXhgRkQKoshRUiwA6OjoGta2Z5dOUjwZExNb0vAN4CJgGbJc0ESA91z7GsBU4rrT5pFQzsxbQ8JCRdISko2rTwAzgeWAZ+756ZS7wcJpeBsxJd5nOBvb4eoxZ62jG6dIEijtWtf1/LyKWS1oL3CfpamALxZASUHxGahbQBbwBXNX4ls1sqBoeMhHxEnBqL/VXKb4FoWc9gC80oDUzy8DDNZhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOsHDJmlpVDxsyycsiYWVYOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrBwyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVs34mlobginzH6203uYFH8vcidngOGQarGpYjITXd2BZPbRMyEiaCfw1MAq4MyIWNLmltjecwHJAWU1LhIykUcA3gQuAbmCtpGUR8YvmdmZ96RlQwwmd3sLOIdY6WiJkgGlAV0S8BCDpXmA2MGJCJvdpUKsbyaeJQw0sh181iohm9zAgSRcDMyPiM2n+SuCsiLimtM48YF6a/ZfAixVe+hhgV53bbST331wHSv+7ImLmUHfSKkcyA4qIRcCiwWwjqTMiOjK1lJ37by73X02rvE9mK3BcaX5SqpnZCNcqIbMWmCrpeEmHApcBy5rck5lV0BKnSxGxV9I1wAqKW9iLI+KFOrz0oE6vRiD331zuv4KWuPBrZq2rVU6XzKxFOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOsHDLWJ0kh6e9K8wdL2inph03q5yRJ6yT9TNJ7eix7UtKLafk6Se9uRo/2di0x1IM1zevAyZIOj4h/ohjIvZmDhV0EPBARf9nH8isiorOB/VgFPpKxgTwG1EbHvhy4p7ZA0hGSFktak44uZqf6FEk/kfRcepyT6tPTEccDkjZKuluSeu5Q0mmSnpa0XtJDksZKmgVcB3xO0urcf2irH4eMDeRe4DJJo4EPAM+Ult0IPBER04Dzgb+SdASwA7ggIs4ALgUWlrY5nSIs3g+cAJzbyz6XAjdExAeADcBNEfEYcAfwtYg4v49e/yadKv15b+FlzeGQsX5FxHpgCsVRzGM9Fs8A5ktaBzwJjAYmA4cA35G0AbifIlBq1kREd0S8BaxLr/1Hko4GxkTEU6m0BPiTCq1eERGnAB9Kjysr/QEtO1+TsSqWAbcC04F3leoC/jQi9vv6GUk3A9uBUyn+I/tdafHvS9NvUqffwYjYmp5/I+l7FN/VtbQer23D4yMZq2Ix8BcRsaFHfQVwbe3URNLpqX40sC0drVxJMS5zJRGxB3hN0odS6UrgqX42qd31OiZNHwJ8HHi+6j4tLx/J2IAiopv9r6vU3ALcBqyXdBDwK4p/4N8CfiBpDrCc4i7VYMwF7pD0DuAl4KoB1j8MWJECZhTwOPCdQe7TMvFA4maWlU+XzCwrh4yZZeWQMbOsHDJmllVbhszMmTMD8MMPP+rzGJa2DJldu3Y1uwUzS9oyZMxs5HDImFlWDhkzy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOsHDJmlpVDxsyycsiYWVYOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrBwyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZZQsZSYsl7ZD0fKk2TtJKSZvS89hUl6SFkrokrZd0RmmbuWn9TZLm5urXzPLIeSRzFzCzR20+sCoipgKr0jzAhcDU9JgH3A5FKAE3AWcB04CbasFkZq0hW8hExI+B3T3Ks4ElaXoJcFGpvjQKTwNjJE0EPgqsjIjdEfEasJK3B5eZjWCNviYzISK2pelXgAlp+ljg5dJ63anWV/1tJM2T1Cmpc+fOnfXt2syGrGkXfiMigKjj6y2KiI6I6Bg/fny9XtbMhqnRIbM9nQaRnnek+lbguNJ6k1Ktr7qZtYhGh8wyoHaHaC7wcKk+J91lOhvYk06rVgAzJI1NF3xnpJqZtYiDc72wpHuA6cAxkrop7hItAO6TdDWwBbgkrf4YMAvoAt4ArgKIiN2SbgHWpvW+EhE9Lyab2Qim4tJIe+no6IjOzs5mt2HWLjScjf2OXzPLyiFjZllVChlJ51apmZn1VPVI5usVa2Zm++n37pKkDwLnAOMlfbG06J3AqJyNmVl7GOgW9qHAkWm9o0r1XwMX52rKzNpHvyETEU8BT0m6KyK2NKgnM2sjVd+Md5ikRcCU8jYR8eEcTZlZ+6gaMvcDdwB3Am/ma8fM2k3VkNkbEbdn7cTM2lLVW9iPSPq8pIlpCM1xadQ6M7N+VT2SqX1y+vpSLYAT6tuOmbWbSiETEcfnbsTM2lOlkJE0p7d6RCytbztm1m6qni6dWZoeDXwEeA5wyJhZv6qeLl1bnpc0Brg3R0Nm1l6GOtTD64Cv05jZgKpek3mEfd8sMAp4H3BfrqbMrH1UvSZza2l6L7AlIroz9GNmbabS6VL6oORGik9ijwX+kLMpM2sfVUfGuwRYA3ya4hsGnpHkoR7MbEBVT5duBM6MiB0AksYDjwMP5GrMzNpD1btLB9UCJnl1ENua2QGs6pHMckkrgHvS/KUUX8hmZtavgcb4PRGYEBHXS/oUcF5a9FPg7tzNmVnrG+hI5jbgywAR8SDwIICkU9KyT2TszczawEDXVSZExIaexVSbkqUjM2srA4XMmH6WHT7UnUraLGmDpHWSOlNtnKSVkjal57GpLkkLJXVJWi/pjKHu18wab6CQ6ZT02Z5FSZ8Bnh3mvs+PiNMioiPNzwdWRcRUYFWaB7gQmJoe8wAPA2rWQga6JnMd8JCkK9gXKh0U38f0yTr3MhuYnqaXAE8CN6T60ogI4GlJYyRNjIhtdd6/mWUw0PcubQfOkXQ+cHIqPxoRTwxzvwH8SFIA346IRRTXf2rB8QowIU0fC7xc2rY71RwyZi2g6ngyq4HVddzveRGxVdK7gZWSNvbYX6QAqkzSPIrTKSZPnly/Ts1sWJryrt2I2JqedwAPAdOA7ZImAqTn2juMtwLHlTaflGo9X3NRRHRERMf48eNztm9mg9DwkJF0hKSjatPADOB5YBn7vhVhLvBwml4GzEl3mc4G9vh6jFnrqPqxgnqaQHExubb/70XEcklrgfskXQ1sofi0NxQfX5gFdAFvAFc1vmUzG6qGh0xEvASc2kv9VYoBynvWA/hCA1ozswz8SWozy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOsHDJmlpVDxsyycsiYWVYOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrJrxlSjWYFPmPzrgOpsXfKwBndiByCHTZqoEilkjOWQM6D2cfHRj9eCQaWE+arFW4JCxPvnoxurBIWOD4uCxwXLItIiRfGrk4LH+tEzISJoJ/DUwCrgzIhY0uSXrR8/gcegcuFoiZCSNAr4JXAB0A2slLYuIXzS3M6vKRzsHrpYIGWAa0BURLwFIuheYDbRlyIzkU6N6cvAcGFolZI4FXi7NdwNnlVeQNA+Yl2Z/K+nFCq97DLCrLh02R9v1r682qZOhabuffx+WR8TMoe6kVUJmQBGxCFg0mG0kdUZER6aWsnP/zeX+q2mVD0huBY4rzU9KNTMb4VolZNYCUyUdL+lQ4DJgWZN7MrMKWuJ0KSL2SroGWEFxC3txRLxQh5ce1OnVCOT+m8v9V6CIaMR+zOwA1SqnS2bWohwyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4Z24+kkPR3pfmDJe2U9MMm9XOSpHWSfibpPaX6OyQ9KmmjpBckLSgtO0zS9yV1SXpG0pRm9G4Fh4z19DpwsqTD0/wFNHeAsIuAByLi9Ij4ZY9lt0bEScDpwLmSLkz1q4HXIuJE4GtAaw3q2WYcMtabx4DaiN6XA/fUFkg6QtJiSWvS0cXsVJ8i6SeSnkuPc1J9uqQnJT2QjjrulqSeO5R0mqSnJa2X9JCksZJmAdcBn5O0urx+RLwREavT9B+A5yhGTIRikPklafoB4CO97dMawyFjvbkXuEzSaOADwDOlZTcCT0TENOB84K8kHQHsAC6IiDOAS4GFpW1OpwiL9wMnAOf2ss+lwA0R8QFgA3BTRDwG3AF8LSLO76tZSWOATwCrUumPA89HxF5gD/Cuqn94q6+WGBnPGisi1qfrGJdTHNWUzQD+taQvpfnRwGTgH4BvSDoNeBN4b2mbNRHRDSBpHTAF+O+1hZKOBsZExFOptAS4v0qvkg6mONJaWPvKHBtZHDLWl2XArcB09j8KEPCnEbHfV85IuhnYDpxKcYT8u9Li35em36S+v3eLgE0RcVupVht4vjuF0NHAq3Xcpw2CT5esL4uBv4iIDT3qK4Bra9c4JJ2e6kcD2yLiLeBKirGYK4mIPcBrkj6USlcCT/WzCWnff5n2e12PRcuAuWn6YorTO48z2yQOGetVRHRHxMJeFt0CHAKsl/RCmgf4FjBX0s+BkyjuUg3GXIrrO+uB04Cv9LeypEkU14feDzyXbnN/Ji3+LvAuSV3AF4H5g+zF6sgDiZtZVj6SMbOsHDJmlpVDxsyycsiYWVZtGTIzZ84MwA8//KjPY1jaMmR27drV7BbMLGnLkDGzkcMhY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOsHDJmlpVDxsyycsiYWVYOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrBwyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZOWTMLKtsISNpsaQdkp4v1cZJWilpU3oem+qStFBSl6T1ks4obTM3rb9J0txc/ZpZHjmPZO4CZvaozQdWRcRUYFWaB7gQmJoe84DboQgl4CbgLGAacFMtmMysNWQLmYj4MbC7R3k2sCRNLwEuKtWXRuFpYIykicBHgZURsTsiXgNW8vbgMrMRrNHXZCZExLY0/QowIU0fC7xcWq871fqqm1mLaNqF34gIIOr1epLmSeqU1Llz5856vayZDVOjQ2Z7Og0iPe9I9a3AcaX1JqVaX/W3iYhFEdERER3jx4+ve+NmNjSNDpllQO0O0Vzg4VJ9TrrLdDawJ51WrQBmSBqbLvjOSDUzaxEH53phSfcA04FjJHVT3CVaANwn6WpgC3BJWv0xYBbQBbwBXAUQEbsl3QKsTet9JSJ6Xkw2sxFMxaWR9tLR0RGdnZ3NbsOsXWg4G/sdv2aWlUPGzLJyyJhZVpVCRtK5VWpmZj1VPZL5esWamdl++r2FLemDwDnAeElfLC16JzAqZ2Nm1h4Gep/MocCRab2jSvVfAxfnasrM2ke/IRMRTwFPSborIrY0qCczayNV3/F7mKRFwJTyNhHx4RxNmVn7qBoy9wN3AHcCb+Zrx8zaTdWQ2RsRt2ftxMzaUtVb2I9I+rykiWmc3nFpaEwzs35VPZKpDc9wfakWwAn1bcfM2k2lkImI43M3YmbtqVLISJrTWz0ilta3HTNrN1VPl84sTY8GPgI8BzhkzKxfVU+Xri3PSxoD3JujITNrL0Md6uF1wNdpzGxAVa/JPMK+ry8ZBbwPuC9XU2bWPqpek7m1NL0X2BIR3Rn6MbM2U+l0KX1QciPFJ7HHAn/I2ZSZtY+qI+NdAqwBPk3xNSbPSPJQD2Y2oKqnSzcCZ0bEDgBJ44HHgQdyNWZm7aHq3aWDagGTvDqIbc3sAFb1SGa5pBXAPWn+UopvfTQz69dAY/yeCEyIiOslfQo4Ly36KXB37ubMrPUNdCRzG/BlgIh4EHgQQNIpadknMvZmZm1goOsqEyJiQ89iqk3J0pGZtZWBQmZMP8sOH+pOJW2WtEHSOkmdqTZO0kpJm9Lz2FSXpIWSuiStl3TGUPdrZo03UMh0Svpsz6KkzwDPDnPf50fEaRHRkebnA6siYiqwKs0DXAhMTY95gIcBNWshA12TuQ54SNIV7AuVDorvY/pknXuZDUxP00uAJ4EbUn1pRATwtKQxkiZGxLY679/MMhjoe5e2A+dIOh84OZUfjYgnhrnfAH4kKYBvR8Qiius/teB4BZiQpo8FXi5t251q+4WMpHkURzpMnjx5mO2ZWb1UHU9mNbC6jvs9LyK2Sno3sFLSxh77ixRAlaWgWgTQ0dExqG3NLJ+mvGs3Iram5x3AQ8A0YLukiQDpufYO463AcaXNJ6WambWAhoeMpCMkHVWbBmYAzwPL2PetCHOBh9P0MmBOust0NrDH12PMWkfVjxXU0wSKi8m1/X8vIpZLWgvcJ+lqYAvFp72h+PjCLKALeAO4qvEtm9lQNTxkIuIl4NRe6q9SDFDesx7AFxrQmpll4E9Sm1lWDhkzy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4ZM8vKIWNmWTXjA5I2AkyZ/+h+85sXfKxJnVi785GMmWXlkDGzrHy6ZMDbT5/Ap1BWHz6SMbOsfCRzAOjtKMWsUXwkY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLLyLWzrk9+gZ/XgIxkzy8pHMm3Gb7yzkcZHMmaWlUPGzLJqmdMlSTOBvwZGAXdGxIImt3RA8sVgG6yWCBlJo4BvAhcA3cBaScsi4hfN7ay5fP3FWkFLhAwwDeiKiJcAJN0LzAYO6JAZKXx0Y/1plZA5Fni5NN8NnFVeQdI8YF6a/a2kFyu87jHArrp02Bwjtn99tdJqI7b/ig6U/pdHxMyh7qRVQmZAEbEIWDSYbSR1RkRHppayc//N5f6raZW7S1uB40rzk1LNzEa4VgmZtcBUScdLOhS4DFjW5J7MrIKWOF2KiL2SrgFWUNzCXhwRL9ThpQd1ejUCuf/mcv8VKCIasR8zO0C1yumSmbUoh4yZZeWQMbOsHDJmlpVDxsyycsiYWVYOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrBwyZpaVQ8b2Iykk/V1p/mBJOyX9sEn9nCRpnaSfSXpPj2VPSnoxLV8n6d2pfpik70vqkvSMpCnN6N0KDhnr6XXgZEmHp/kLaO4AYRcBD0TE6RHxy16WXxERp6XHjlS7GngtIk4EvgZUGwzUsnDIWG8eA2ojgV8O3FNbIOkISYslrUlHF7NTfYqkn0h6Lj3OSfXp6YjjAUkbJd0tST13KOk0SU9LWi/pIUljJc0CrgM+J2n1IPqfDSxJ0w8AH+ltn9YYDhnrzb3AZZJGAx8AniktuxF4IiKmAecDfyXpCGAHcEFEnAFcCiwsbXM6RVi8HzgBOLeXfS4FboiIDwAbgJsi4jHgDuBrEXF+H73+TTpV+vNSkPxx4PmI2AvsAd41mB+A1Y9Dxt4mItYDUyiOYh7rsXgGMF/SOuBJYDQwGTgE+I6kDcD9FIFSsyYiuiPiLWBdeu0/knQ0MCYinkqlJcCfVGj1iog4BfhQelxZ6Q9oDdUSw29aUywDbgWms/9RgIA/jYj9vnJG0s3AduBUiv+8flda/PvS9JvU6fcuIram599I+h7F93MtZd/A892SDgaOBl6txz5t8HwkY31ZDPxFRGzoUV8BXFs7NZF0eqofDWxLRytXUozFXElE7AFek/ShVLoSeKqfTWp3vY5J04cAHweeT4uXAXPT9MUUp3ceZ7ZJfCRjvYqIbva/rlJzC3AbsF7SQcCvKP6Bfwv4gaQ5wHKKu1SDMRe4Q9I7gJeAqwZY/zBgRQqYUcDjwHfSsu8CfyupC9hN8e0W1iQeSNzMsvLpkpll5ZAxs6wcMmaWlUPGzLJqy5CZOXNmAH744Ud9HsPSliGza9euZrdgZklbhoyZjRwOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrBwyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOsHDJmlpVDxsyycsiYWVYOGTPLyiFjZlllCxlJiyXtkPR8qTZO0kpJm9Lz2FSXpIWSuiStl3RGaZu5af1Nkubm6tfM8sh5JHMXMLNHbT6wKiKmAqvSPMCFwNT0mAfcDkUoATcBZwHTgJtqwWRmrSFbyETEj4HdPcqzgSVpeglwUam+NApPA2MkTQQ+CqyMiN0R8RqwkrcHl5mNYI2+JjMhIral6VeACWn6WODl0nrdqdZX3cxaRNMu/EZEAFGv15M0T1KnpM6dO3fW62XNbJgaHTLb02kQ6XlHqm8FjiutNynV+qq/TUQsioiOiOgYP3583Rs3s6FpdMgsA2p3iOYCD5fqc9JdprOBPem0agUwQ9LYdMF3RqqZWYs4ONcLS7oHmA4cI6mb4i7RAuA+SVcDW4BL0uqPAbOALuAN4CqAiNgt6RZgbVrvKxHR82KymY1gKi6NtJeOjo7o7Oxsdhtm7ULD2djv+DWzrBwyZpZVpZCRdG6VmplZT1WPZL5esWZmtp9+7y5J+iBwDjBe0hdLi94JjMrZmJm1h4FuYR8KHJnWO6pU/zVwca6mzKx99BsyEfEU8JSkuyJiS4N6MrM2UvXNeIdJWgRMKW8TER/O0ZSZtY+qIXM/cAdwJ/BmvnbMrN1UDZm9EXF71k7MrC1VvYX9iKTPS5qYhtAcl0atMzPrV9Ujmdonp68v1QI4ob7tmFm7qRQyEXF87kbMrD1VChlJc3qrR8TS+rZjZu2m6unSmaXp0cBHgOcAh4yZ9avq6dK15XlJY4B7czRkZu1lqEM9vA74Oo2ZDajqNZlH2PfNAqOA9wH35WrKzNpH1Wsyt5am9wJbIqI7Qz9m1mYqnS6lD0pupPgk9ljgDzmbMrP2UXVkvEuANcCnKb5h4BlJHurBzAZU9XTpRuDMiNgBIGk88DjwQK7GzKw9VL27dFAtYJJXB7GtmR3Aqh7JLJe0ArgnzV9K8YVsZmb9GmiM3xOBCRFxvaRPAeelRT8F7s7dnJm1voGOZG4DvgwQEQ8CDwJIOiUt+0TG3sysDQx0XWVCRGzoWUy1KVk6MrO2MlDIjOln2eF17MPM2tRAIdMp6bM9i5I+Azw71J1K2ixpg6R1kjpTbZyklZI2peexqS5JCyV1SVov6Yyh7tfMGm+gazLXAQ9JuoJ9odJB8X1Mnxzmvs+PiF2l+fnAqohYIGl+mr8BuBCYmh5nAbenZzNrAQN979J24BxJ5wMnp/KjEfFEhl5mA9PT9BLgSYqQmQ0sjYgAnpY0RtLEiNiWoQczq7Oq48msBlbXcb8B/EhSAN+OiEUUF5lrwfEKMCFNHwu8XNq2O9X2CxlJ84B5AJMnT65jq2Y2HFXfjFdv50XEVknvBlZK2lheGBGRAqiyFFSLADo6Oga1rZnl05SPBkTE1vS8A3gImAZslzQRID3XPsawFTiutPmkVDOzFtDwkJF0hKSjatPADOB5YBn7vnplLvBwml4GzEl3mc4G9vh6jFnraMbp0gSKO1a1/X8vIpZLWgvcJ+lqYAvFkBJQfEZqFtAFvAFc1fiWzWyoGh4yEfEScGov9VcpvgWhZz2ALzSgNTPLwMM1mFlWDhkzy6pZt7CtyabMf3S/+c0LPtakTqzd+UjGzLJyyJhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOs/D4ZA97+vhnwe2esPnwkY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVn6fzAGgt/fAmDWKj2TMLCuHjJll5ZAxs6wcMmaWlS/8Wp/8oUmrBx/JmFlWDhkzy8ohY2ZZOWTMLKuWCRlJMyW9KKlL0vxm92Nm1bTE3SVJo4BvAhcA3cBaScsi4hfN7Wzk8UcIbKRpiZABpgFdEfESgKR7gdmAQ6bBfFvbBqtVQuZY4OXSfDdwVnkFSfOAeWn2t5JerPC6xwC76tJhc4yI/vXVIW86IvofhgOl/+URMXOoO2mVkBlQRCwCFg1mG0mdEdGRqaXs3H9zuf9qWuXC71bguNL8pFQzsxGuVUJmLTBV0vGSDgUuA5Y1uSczq6AlTpciYq+ka4AVwChgcUS8UIeXHtTp1Qjk/pvL/VegiGjEfszsANUqp0tm1qIcMmaWlUPGzLJyyJhZVg4ZM8vKIWNmWTlkzCwrh4yZZeWQMbOsHDJmlpVDxsyycsiYWVYOmSaRFJL+rjR/sKSdkn7YpH5OkrRO0s8kvadUf4ekRyVtlPSCpAWlZYdJ+n4a3P0ZSVMq7GezpA1pX52l+jhJKyVtSs9jU12SFqZ9rJd0RmmbuWn9TZLm1u2HMUSSbpb0pWb3MdI4ZJrndeBkSYen+Qto7kBcFwEPRMTpEfHLHstujYiTgNOBcyVdmOpXA69FxInA14CqA3GeHxGn9RiVbT6wKiKmAqvSPMCFwNT0mAfcDkUoATdRDMM6DbipFkw2sjhkmusxoDYK9+XAPbUFko6QtFjSmnR0MTvVp0j6iaTn0uOcVJ8u6UlJD6SjjrslqecOJZ0m6el0VPCQpLGSZgHXAZ+TtLq8fkS8ERGr0/QfgOcoRiaEYjD3JWn6AeAjve2zovJrLaEIvVp9aRSeBsZImgh8FFgZEbsj4jVgJfC2cWglLZD0i/TnvTXVPpGOvH4m6XFJE1L9ZklL0s93i6RPSfrP6chruaRD0nqbS/U1kk7sZb/vSds8m17vpFT/tKTnJf1c0o+H+LNqLRHhRxMewG+BD1D84xwNrAOmAz9My/8T8H+n6THA3wNHAO8ARqf6VKAzTU8H9lAEwEHAT4HzetnveuD/TNNfAW5L0zcDXxqg5zHAS8AJaf55YFJp+S+BYwZ4jV9RBNWzwLxS/R9L06rNAz8s/zkojnI6gC8Bf1aq/3nP/oF3AS+yb9ykMel5bKn2GeC/lH4G/x04BDgVeAO4MC17CLgoTW8GbkzTc0p/Z3/8GaY+p6bps4An0vQG4NhyP+3+aImR8dpVRKxP1zEupziqKZsB/OvSOf5oYDLwD8A3JJ0GvAm8t7TNmojoBpC0DphC8Y+GVDua4hf7qVRaAtxfpVdJB1McaS2M9NU0Q3ReRGyV9G5gpaSNEbHf/+gREZLqMZraHuB3wHfTta7a9a5JwPfTEdGhFMFX898i4p8lbaAYhXF5qm+g+HnW3FN6/lp5p5KOBM4B7i8d2B2Wnv8HcJek+4AHh/WnaxE+XWq+ZcCtlE6VEgF/GsW1i9MiYnJE/C/g3wPbKf6n7aD4R1Lz+9L0m9R3eNVFwKaIuK1U++MA7ymEjgZe7e9FImJret5BcXQwLS3anv7Rk5539NxHUhtEfsDB5SNib3r9B4CPsy8wvg58IyJOAf5figCv+X3a9i3gnyMdcgBvsf/PM/qYhuLf1T+W/u5Oi4j3pdf9t8Cfpd6flfQu2pxDpvkWA38RERt61FcA19aucUg6PdWPBralfwRXUvxvW0lE7AFek/ShVLoSeKqfTUj7/su03+t6LFoG1O7qXExxShCSjpW0qpfXOULSUbVpiqO153t5rbnAw6X6nHSX6WxgT0Rso/j5zEjXlMam11rRY39HAkdHxGMU4XxqWnQ0+wJpqHelLi09/7S8ICJ+DfxK0qdTH5J0app+T0Q8ExH/EdjJ/kHZlny61GTp9GZhL4tuAW4D1ks6iOKQ/uPAt4AfSJpD8T/z64Pc5VzgDknvoLi+clV/K0uaBNwIbASeS5n3jYi4E/gu8LeSuoDdFN8iATAR2NvLy00AHkqvcTDwvYioHV0sAO6TdDWwBbgk1R8DZgFdFNdIrgKIiN2SbqH4JguAr0TE7h77Owp4WNJoiiPDL6b6zRSnMq8BTwDH9/cz6MNYSespjnwu72X5FcDtkv6M4hrPvcDPgb+SNDX1syrV2poHEre6U/HNEv87Itrya2skbQY6IqKVvz2yYXwkY3UXEd9odg82cvhIxsyy8oVfM8vKIWNmWbVlyMycOTMo3rvghx9+DP8xLG0ZMrt2+aK/2UjRliFjZiOHQ8bMsnLImFlWfjPeAWrK/Ef3m9+84GN9rGk2PD6SMbOsHDJmlpVDxsyycsiYWVYOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrBwyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZOWTMLCuPJ2PA28eXAY8xY/XhIxkzy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLLKFjKSFkvaIen5Um2cpJWSNqXnsakuSQsldUlaL+mM0jZz0/qbJM3N1a+Z5ZHzSOYuYGaP2nxgVURMBValeYALganpMQ+4HYpQAm4CzgKmATfVgsnMWkO2kImIHwO7e5RnA0vS9BLgolJ9aRSeBsZImgh8FFgZEbsj4jVgJW8PLjMbwRp9TWZCRGxL068AE9L0scDLpfW6U62v+ttImiepU1Lnzp0769u1mQ1Z0y78RkQAUcfXWxQRHRHRMX78+Hq9rJkNU6NDZns6DSI970j1rcBxpfUmpVpfdTNrEY0OmWVA7Q7RXODhUn1Oust0NrAnnVatAGZIGpsu+M5INTNrEdmGepB0DzAdOEZSN8VdogXAfZKuBrYAl6TVHwNmAV3AG8BVABGxW9ItwNq03lcioufFZDMbwbKFTERc3seij/SybgBf6ON1FgOL69jaAae3sWLMGsXv+DWzrBwyZpZVpZCRdG6VmplZT1WPZL5esWZmtp9+L/xK+iBwDjBe0hdLi94JjMrZmJm1h4HuLh0KHJnWO6pU/zVwca6mzKx99BsyEfEU8JSkuyJiS4N6MrM2UvV9ModJWgRMKW8TER/O0ZSZtY+qIXM/cAdwJ/BmvnbMrN1UDZm9EXF71k7MrC1VvYX9iKTPS5qYhtAcl0atMzPrV9Ujmdonp68v1QI4ob7tmFm7qRQyEXF87kbMrD1VChlJc3qrR8TS+rZjZu2m6unSmaXp0RTDNTwHOGTMrF9VT5euLc9LGgPcm6MhM2svQx3q4XXA12nMbEBVr8k8wr5vFhgFvA+4L1dTZtY+ql6TubU0vRfYEhHdGfoxszZT6XQpfVByI8UnsccCf8jZlJm1j6oj410CrAE+TfENA89I8lAPZjagqqdLNwJnRsQOAEnjgceBB3I1ZmbtoWrIHFQLmORVPAh52+vtq1Q2L/hYEzqxVlY1ZJZLWgHck+YvpfhCNjOzfg00xu+JwISIuF7Sp4Dz0qKfAnfnbs7MWt9ARzK3AV8GiIgHgQcBJJ2Sln0iY29m1gYGuq4yISI29Cym2pQsHZlZWxkoZMb0s+zwOvZhZm1qoJDplPTZnkVJnwGeHepOJW2WtEHSOkmdqTZO0kpJm9Lz2FSXpIWSuiStl3TGUPdrZo030DWZ64CHJF3BvlDpoPg+pk8Oc9/nR8Su0vx8YFVELJA0P83fAFwITE2Ps4Db07OZtYCBvndpO3COpPOBk1P50Yh4IkMvs4HpaXoJ8CRFyMwGlkZEAE9LGiNpYkRsy9CDmdVZ1fFkVgOr67jfAH4kKYBvR8QiiovMteB4BZiQpo8FXi5t251q+4WMpHnAPIDJkyfXsVUzG46qb8art/MiYqukdwMrJW0sL4yISAFUWQqqRQAdHR2D2tbM8mlKyETE1vS8Q9JDwDRge+00SNJEoPYxhq3AcaXNJ6Wa9aK3jwKYNVPDP38k6QhJR9WmgRnA88Ay9n31ylzg4TS9DJiT7jKdDezx9Riz1tGMI5kJFHesavv/XkQsl7QWuE/S1cAWiiEloPiM1CygC3gDuKrxLZvZUDU8ZCLiJeDUXuqvUnwLQs96AF9oQGtmloGHazCzrBwyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZOWTMLCuHjJll5ZAxs6wcMmaWlUPGzLJyyJhZVg4ZM8uqWSPjWYvy92PbYPlIxsyycsiYWVYOGTPLyiFjZlk5ZMwsK4eMmWXlkDGzrBwyZpaVQ8bMsnLImFlW/lhBC/P3Xlsr8JGMmWXlIxkbNn9o0vrjIxkzy6plQkbSTEkvSuqSNL/Z/ZhZNS1xuiRpFPBN4AKgG1graVlE/KK5nVlfep5C+fTpwNUSIQNMA7oi4iUASfcCs4EDJmR8J8laVauEzLHAy6X5buCs8gqS5gHz0uxvJb1Y4XWPAXbVpcPmaJn+9dVeyy3Tfx8OlP6XR8TMoe6kVUJmQBGxCFg0mG0kdUZER6aWsnP/zeX+q2mVC79bgeNK85NSzcxGuFYJmbXAVEnHSzoUuAxY1uSezKyCljhdioi9kq4BVgCjgMUR8UIdXnpQp1cjkPtvLvdfgSKiEfsxswNUq5wumVmLcsiYWVYHZMiMtI8oSNosaYOkdZI6U22cpJWSNqXnsakuSQtT7+slnVF6nblp/U2S5pbq/0d6/a60rerQ82JJOyQ9X6pl77mvfdSp/5slbU1/D+skzSot+3Lq5UVJHy3Ve/1dSjcpnkn176cbFkg6LM13peVThtD7cZJWS/qFpBck/bv+fjZN//lHxAH1oLhw/EvgBOBQ4OfA+5vc02bgmB61/wzMT9Pzga+m6VnAfwMEnA08k+rjgJfS89g0PTYtW5PWVdr2wjr0/CfAGcDzjey5r33Uqf+bgS/1su770+/JYcDx6fdnVH+/S8B9wGVp+g7gc2n688Adafoy4PtD6H0icEaaPgr4+9TjiPz5N/0ffaMfwAeBFaX5LwNfbnJPm3l7yLwITCz9Ur2Ypr8NXN5zPeBy4Nul+rdTbSKwsVTfb71h9j2lxz/S7D33tY869X8zvYfMfr8jFHc5P9jX71L6h7kLOLjn71xt2zR9cFpPw/x7eJjic30j8ud/IJ4u9fYRhWOb1EtNAD+S9KyKj0cATIiIbWn6FWBCmu6r//7q3b3Uc2hEz33to16uSacUi0unAoPt/13AP0bE3l76/+M2afmetP6QpNOt04FnGKE//wMxZEai8yLiDOBC4AuS/qS8MIr/NlrqvQaN6DnDPm4H3gOcBmwD/ksdX7vuJB0J/AC4LiJ+XV42kn7+B2LIjLiPKETE1vS8A3iI4lPn2yVNBEjPO9LqffXfX31SL/UcGtFzX/sYtojYHhFvRsRbwHco/h6G0v+rwBhJB/eo7/daafnRaf1BkXQIRcDcHREPpvKI/PkfiCEzoj6iIOkISUfVpoEZwPOpp9rV/rkU592k+px0x+BsYE86fF0BzJA0Nh3mz6C4DrAN+LWks9Mdgjml16q3RvTc1z6GrfaPJ/kkxd9DbZ+XpTtDxwNTKS6M9vq7lP6HXw1c3MfPotb/xcATaf3B9Cngu8D/ioj/Wlo0Mn/+9bgA2GoPiqvtf09xZ+DGJvdyAsVdiZ8DL9T6oThPXwVsAh4HxqW6KAbw+iWwAegovdb/A3Slx1WlegfFP5hfAt9gmBca02veQ3FK8c8U5+xXN6LnvvZRp/7/NvW3Pv1jmlha/8bUy4uU7s719buU/l7XpD/X/cBhqT46zXel5ScMoffzKE5T1gPr0mPWSP35+2MFZpbVgXi6ZGYN5JAxs6wcMmaWlUPGzLJyyJhZVg4ZGxJJv212D9YaHDJmlpVDxoZF0nRJT0p6QNJGSXeXxh45U9L/lPRzSWskHSVptKS/SWOV/EzS+WndfyPp/0tjlGyWdI2kL6Z1npY0Lq33HknL04dJfyLppGb++W1gLTGQuI14pwP/CvgH4H8A50paA3wfuDQi1kp6J/BPwL+j+GzdKSkgfiTpvel1Tk6vNZriHag3RMTpkr5G8db22ygGv/63EbFJ0lnAt4APN+oPaoPnkLF6WBMR3QCS1lGM07IH2BYRawEifUpY0nnA11Nto6QtQC1kVkfEb4DfSNoDPJLqG4APpE8dnwPcr32D+x2W949mw+WQsXr4fWn6TYb+e1V+nbdK82+l1zyIYpyW04b4+tYEviZjubwITJR0JkC6HnMw8BPgilR7LzA5rTugdDT0K0mfTttL0qk5mrf6cchYFhHxB+BS4OuSfg6spLjW8i3gIEkbKK7Z/JuI+H3fr/Q2VwBXp9d8AZhd386t3vwpbDPLykcyZpaVQ8bMsnLImFlWDhkzy8ohY2ZZOWTMLCuHjJll9f8Dy2zwTttaswsAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# plot resulting statistic distributions\n", "g = sns.FacetGrid(results, col=\"type\", col_wrap=1, height=2, aspect=2)\n", "g.map(plt.hist, \"income\", range=[0, 200000], bins=40)\n", "g.set_axis_labels(\"Income\", \"Count\")\n", "g.set_titles(\"{col_name}\")\n", "\n", "plt.tight_layout();" ] }, { "cell_type": "markdown", "id": "71fe77d1-9230-4323-981f-be6a03607f13", "metadata": {}, "source": [ "## Bootstrap" ] }, { "cell_type": "markdown", "id": "6974ded2-d837-4cee-96df-9f772c73d905", "metadata": {}, "source": [ "A bootstrap can be computed using scikit-learn's resample method (sampling with replacement)." ] }, { "cell_type": "code", "execution_count": 8, "id": "48249d77-0dfc-4130-9406-258941dab262", "metadata": {}, "outputs": [], "source": [ "def bootstrap(data, function, no_draws):\n", " \"\"\"function draw no_draws samples of data, applies func and stores result in array\"\"\"\n", " results = []\n", " for nrepeat in range(no_draws):\n", " sample = resample(data, replace=True)\n", " results.append(function(sample))\n", "\n", " # convert to pandas Series for easier statistic computation\n", " return pd.Series(results)" ] }, { "cell_type": "code", "execution_count": 9, "id": "1f28124d-5d2f-489a-bbb3-092dfba89aa6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAAsTAAALEwEAmpwYAAARUElEQVR4nO3dfbBcdX3H8feHRAQUCUhIkYAJFUFa5emCOGpVKA9KFaxIsVozSI2jtgPaaQlqWzudzoQ+odZqpWKNtiqK8qBoNSDotKNAAsiDgEQMGh6jwoCMA41++8f+YrbpvTd7wz27yc37NbNzz/ntOXu++9uTfPY8bqoKSdK2bbtRFyBJGj3DQJJkGEiSDANJEoaBJAmYPeoCBrH77rvXggULRl2GJG1VVq5c+eOqmjvItFtFGCxYsIAVK1aMugxJ2qokuWvQad1NJEnqdssgyWrgEeAXwLqqGkuyG3ABsABYDZxSVQ92WYckaXLD2DJ4WVUdXFVjbXwJcEVV7Qdc0cYlSSM0it1EJwLL2vAy4KQR1CBJ6tN1GBTwtSQrkyxubfOq6t42fB8wb7wZkyxOsiLJirVr13ZcpiRt27o+m+hFVXV3kj2A5Ulu63+yqirJuHfKq6rzgPMAxsbGvJueJHWo0y2Dqrq7/X0AuAg4Arg/yZ4A7e8DXdYgSdq0zsIgyVOS7Lx+GDgWuBm4FFjUJlsEXNJVDZKkwXS5m2gecFGS9cv5VFX9Z5Jrgc8mOR24CzilwxokSQPoLAyq6k7goHHafwIc3dVytW1asOSykSx39dITRrJcabp5BbIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEliK/mlM2lLNarrG8BrHDS93DKQJBkGkiTDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJIkhhEGSWUmuT/KlNr4wydVJViW5IMn2XdcgSZrcMLYMzgBu7Rs/Bzi3qp4FPAicPoQaJEmT6DQMkswHTgA+2sYDHAVc2CZZBpzUZQ2SpE3resvgfcCfAb9s408HHqqqdW18DbDXeDMmWZxkRZIVa9eu7bhMSdq2dRYGSX4HeKCqVm7O/FV1XlWNVdXY3Llzp7k6SVK/2R2+9guBVyV5BbAD8DTg/cCcJLPb1sF84O4Oa5AkDaCzLYOqOruq5lfVAuBU4OtV9XrgSuDkNtki4JKuapAkDWYU1xmcBbwzySp6xxDOH0ENkqQ+Xe4m+pWqugq4qg3fCRwxjOVKkgbjFciSJMNAkmQYSJIwDCRJGAaSJAwDSRJDOrVU0vRbsOSykSx39dITRrJcdcstA0mSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgSWLAMEjy3K4LkSSNzqBbBh9Kck2StyXZpdOKJElDN1AYVNWLgdcDewMrk3wqyTGTzZNkhxYg30lyS5K/au0Lk1ydZFWSC5Js/4TfhSTpCRn4mEFV3QG8BzgLeAnwgSS3JfndCWZ5DDiqqg4CDgaOT3IkcA5wblU9C3gQOP0J1C9JmgaDHjN4XpJzgVuBo4BXVtVz2vC5481TPT9ro09qj2rzXNjalwEnbXb1kqRpMeiWwT8B1wEHVdXbq+o6gKq6h97WwriSzEpyA/AAsBz4PvBQVa1rk6wB9ppg3sVJViRZsXbt2gHLlCRtjkHD4ATgU1X1c4Ak2yXZCaCqPjnRTFX1i6o6GJgPHAEcMGhhVXVeVY1V1djcuXMHnU2StBkGDYPLgR37xndqbQOpqoeAK4EXAHOSzG5PzQfuHvR1JEndGDQMdujb/08b3mmyGZLMTTKnDe8IHEPvmMOVwMltskXAJVOsWZI0zQYNg0eTHLp+JMlhwM83Mc+ewJVJbgSuBZZX1ZfonY30ziSrgKcD50+9bEnSdJq96UkAOBP4XJJ7gAC/BvzeZDNU1Y3AIeO030nv+IEkaQsxUBhU1bVJDgD2b023V9X/dFeWJGmYBt0yADgcWNDmOTQJVfWJTqqSJA3VQGGQ5JPArwM3AL9ozQUYBpI0Awy6ZTAGHFhV1WUxkqTRGPRsopvpHTSWJM1Ag24Z7A58N8k19G5AB0BVvaqTqiRJQzVoGLy3yyIkSaM16Kml30jyTGC/qrq83ZdoVrelSZKGZdBbWL+Z3m2nP9Ka9gIu7qgmSdKQDXoA+e3AC4GH4Vc/dLNHV0VJkoZr0DB4rKoeXz/S7jrqaaaSNEMMGgbfSPIuYMf228efA77YXVmSpGEaNAyWAGuBm4C3AF9mkl84kyRtXQY9m+iXwL+2hyRphhn03kQ/YJxjBFW177RXJEkauqncm2i9HYDXArtNfzmSpFEY6JhBVf2k73F3Vb0POKHb0iRJwzLobqJD+0a3o7elMJXfQpAkbcEG/Q/9H/qG1wGrgVOmvRpJ0kgMejbRy7ouRNLWYcGSy0a27NVL3TvdlUF3E71zsuer6h+npxxJ0ihM5Wyiw4FL2/grgWuAO7ooSpI0XIOGwXzg0Kp6BCDJe4HLquoNXRUmSRqeQW9HMQ94vG/88dYmSZoBBt0y+ARwTZKL2vhJwLJOKpIkDd2gZxP9TZKvAC9uTadV1fXdlSVJGqZBdxMB7AQ8XFXvB9YkWdhRTZKkIRv0Zy//EjgLOLs1PQn4966KkiQN16DHDF4NHAJcB1BV9yTZubOqtFUa5cVIkp6YQXcTPV5VRbuNdZKndFeSJGnYBg2Dzyb5CDAnyZuBy/GHbiRpxtjkbqIkAS4ADgAeBvYH/qKqlndcmyRpSDYZBlVVSb5cVc8FDABJmoEG3U10XZLDp/LCSfZOcmWS7ya5JckZrX23JMuT3NH+7jrlqiVJ02rQMHg+8O0k309yY5Kbkty4iXnWAX9SVQcCRwJvT3IgsAS4oqr2A65o45KkEZp0N1GSfarqh8BxU33hqroXuLcNP5LkVmAv4ETgpW2yZcBV9K5hkCSNyKaOGVxM726ldyX5fFW9ZnMWkmQBvesUrgbmtaAAuI8JbniXZDGwGGCfffbZnMVKkga0qd1E6Rved3MWkOSpwOeBM6vq4f7n+q9d2FhVnVdVY1U1Nnfu3M1ZtCRpQJsKg5pgeCBJnkQvCP6jqr7Qmu9Psmd7fk/ggam+riRpem0qDA5K8nCSR4DnteGHkzyS5OHJZmzXJ5wP3LrRz2JeCixqw4uASza3eEnS9Jj0mEFVzXoCr/1C4A+Am5Lc0NreBSyld0Xz6cBdwClPYBmSpGkw6I3qpqyq/ov/e8yh39FdLVeSNHVT+T0DSdIMZRhIkgwDSZJhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEh2GQZKPJXkgyc19bbslWZ7kjvZ3166WL0kaXJdbBh8Hjt+obQlwRVXtB1zRxiVJI9ZZGFTVN4GfbtR8IrCsDS8DTupq+ZKkwQ37mMG8qrq3Dd8HzBvy8iVJ4xjZAeSqKqAmej7J4iQrkqxYu3btECuTpG3PsMPg/iR7ArS/D0w0YVWdV1VjVTU2d+7coRUoSduiYYfBpcCiNrwIuGTIy5ckjaPLU0s/DXwL2D/JmiSnA0uBY5LcAfx2G5ckjdjsrl64ql43wVNHd7VMSdLm8QpkSZJhIEkyDCRJdHjMQKOzYMlloy5B0lbGLQNJkmEgSTIMJEl4zEDSVmRUx8NWLz1hJMsdJrcMJEmGgSTJMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJf9ymM/4ovaStiVsGkiTDQJJkGEiS8JiBJG3SqI4Brl56wtCW5ZaBJMkwkCQZBpIkDANJEtvAAWQv/pKkTRvJlkGS45PcnmRVkiWjqEGStMHQwyDJLOCfgZcDBwKvS3LgsOuQJG0wii2DI4BVVXVnVT0OfAY4cQR1SJKaURwz2Av4Ud/4GuD5G0+UZDGwuI3+LMntQ6htVHYHfjzqIrYQ9sUG9sUG22Rf5Jxxm6fSF88cdFlb7AHkqjoPOG/UdQxDkhVVNTbqOrYE9sUG9sUG9sUGXfXFKHYT3Q3s3Tc+v7VJkkZkFGFwLbBfkoVJtgdOBS4dQR2SpGbou4mqal2SPwK+CswCPlZVtwy7ji3MNrE7bED2xQb2xQb2xQad9EWqqovXlSRtRbwdhSTJMJAkGQbTKsmcJBcmuS3JrUlekOTgJN9OckOSFUmO2Giew5OsS3JyX9uiJHe0x6K+9sOS3NRu4/GBJBnm+5uKqfZFkpe29luSfKOvfdxbl7QTEK5u7Re0kxG2SFPpiyS7JPliku+0vjit73Vm6npxUJJvtffwxSRP65v+7Pa+bk9yXF/7TF0vxu2LJMckWdnaVyY5qu91xv38k+yWZHlbX5Yn2XXSgqrKxzQ9gGXAH7bh7YE5wNeAl7e2VwBX9U0/C/g68GXg5Na2G3Bn+7trG961PXcNcCQQ4CvrX3dLfEylL9pz3wX2aeN79PXP94F922t8BziwPfdZ4NQ2/C/AW0f9nqepL94FnNOG5wI/bfPM5PXiWuAlre1NwF+34QPbZ/5kYGFbF2bN8PVior44BHhGG/5N4O6+1xn38wf+FljShpesX68merhlME2S7AL8FnA+QFU9XlUPAQWs/6azC3BP32x/DHweeKCv7ThgeVX9tKoeBJYDxyfZE3haVX27ep/uJ4CTuntHm28z+uL3gS9U1Q/b9Ov7Y9xbl7RvPkcBF7bpljFz+qKAndt7fCq9MFjHzF4vng18s022HHhNGz4R+ExVPVZVPwBW0VsnZvJ6MW5fVNX1VbV+HbkF2DHJkzfx+Z9Irw9ggL4wDKbPQmAt8G9Jrk/y0SRPAc4E/i7Jj4C/B84GSLIX8Grgwxu9zni369irPdaM074lmlJf0PsHsGuSq9om8Btb+0R98XTgoapat1H7lmiqffFB4Dn0wuEm4Iyq+iUze724hQ33J3stGy5Knew9z9T1YqK+6Pca4LqqeozJP/95VXVvG74PmDdZQYbB9JkNHAp8uKoOAR6lt2n2VuAdVbU38A7aNwHgfcBZ7R/6TDPVvpgNHAacQO8b8J8nefbQq+7GVPviOOAG4BnAwcAH+/ehb+Um6os3AW9LshLYGXh8dCUOzWb1RZLfAM4B3jKVhbWthkmvIzAMps8aYE1VXd3GL6T3YS8CvtDaPkdvExdgDPhMktXAycCHkpzExLfruLsNb9y+JZpqX6wBvlpVj1bVj+ltJh/ExH3xE2BOktkbtW+JptoXp9HbZVZVtQr4AXAAM3i9qKrbqurYqjoM+DS94wEw+XuekevFJH1BkvnARcAbq6q/jyb6/O9vu5Fof/t3R/8/hsE0qar7gB8l2b81HU3voOg9wEta21HAHW36hVW1oKoW0FsR3lZVF9O7MvvYJLu2o//H0vuP8l7g4SRHtn2jbwQuGc67m5qp9gW99/GiJLOT7ETvLra3MsGtS9q3nCvphSj0/mOdKX3xwzYNSeYB+9M7WDxj14skewAk2Q54D70Dv9C7Tc2pbd/4QmA/egdLZ+x6MVFfJJkDXEbvgPB/973OZJ//pfT6AAbpi2EcNd9WHvQ261cANwIX0zvr40XASnpnPFwNHDbOfB+nnU1UG84iWNUep/W1jwE30/u28EHaFeRb4mOqfQH8Kb3/JG8GzuxrfwXwvfae393Xvi+9/xhW0ftm/eRRv+fp6At6u4e+Ru94wc3AG7aB9eKM9hl/D1jaXz/w7va+bqfvLKkZvF6M2xf0guFRersQ1z/Wn3U37udP7xjKFfS+aFwO7DZZPd6OQpLkbiJJkmEgScIwkCRhGEiSMAwkSRgGkiQMA0kS8L8l5uzAQTba7AAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# also: we can use this example to showcase the central limit theorem (again), changing the number of draws\n", "mean_distribution = bootstrap(loans_income, np.mean, 200)\n", "mean_distribution.plot.hist()" ] }, { "cell_type": "code", "execution_count": 10, "id": "3502af3a-32ba-4ce5-9a9d-d9c1c1b34ef3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bootstrap Statistics:\n", "original data: 68760.51844\n", "std. error: 144.6409686847048\n" ] } ], "source": [ "print(\"Bootstrap Statistics:\")\n", "print(f\"original data: {loans_income.mean()}\")\n", "# we compute standard error of means via standard deviation\n", "# but: do not confuse the two\n", "print(f\"std. error: {mean_distribution.std()}\")" ] }, { "cell_type": "markdown", "id": "6827de04-fcd7-4a13-a9fb-0e507e9b21b3", "metadata": {}, "source": [ "## Confidence Interval" ] }, { "cell_type": "code", "execution_count": 11, "id": "8eecd61a-5472-4f66-8403-b9d47814fb6d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "68760.51844\n" ] }, { "data": { "text/plain": [ "0.05 68515.351217\n", "0.95 68966.071996\n", "dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(loans_income.mean())\n", "# create a sample of 20 loan income data\n", "mean_distribution = bootstrap(loans_income, np.mean, 200)\n", "mean_distribution.quantile([0.05, 0.95])\n", "confidence_interval = list(mean_distribution.quantile([0.05, 0.95]))" ] }, { "cell_type": "code", "execution_count": 12, "id": "c8cf6a73-a40b-4454-b1b6-91b0189374d4", "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsgAAAI4CAYAAAB3OR9vAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAAsTAAALEwEAmpwYAAA0yklEQVR4nO3debhdVX0/4M8iYUZMmJEwViahEuQiYlEiARFEQUFwQFHQKCgaf6JC1aLFVixpxWKhRqnEqgVkEEG0QiBAKwoJhDKPgoBMAQJCwhBYvz/uIW7wBO4NOXdneN/nuc/dZ52z9/6elXNPPnfdtdcptdYAAAD9lmq7AAAAWJgIyAAA0CAgAwBAg4AMAAANAjIAADQIyAAA0NDTgFxK+Wwp5dpSyjWllP8qpSxXStmwlPK7UsotpZRTSinL9LIGAAAYjJ4F5FLKOkk+naSv1rplkmFJ3pvkm0m+VWt9dZKHkxzUqxoAAGCwej3FYniS5Uspw5OskOSeJDslOa1z/6Qke/W4BgAAGLDhvTpwrfXuUsqEJH9IMjvJr5NMSzKz1jqn87C7kqzTbf9Syrgk45JkxRVX3GazzTbrVakAACyBpk2bNqPWuvoL23sWkEspI5PsmWTDJDOT/DTJ2wa6f611YpKJSdLX11enTp3agyoBAFhSlVLu6NbeyykWOyf5fa31gVrr00nOSPI3SUZ0plwkyagkd/ewBgAAGJReBuQ/JHlDKWWFUkpJMjbJdUkuTLJP5zEHJDmrhzUAAMCg9Cwg11p/l/6L8a5IcnXnXBOTfDHJ/yul3JJk1SQn9qoGAAAYrJ7NQU6SWuuRSY58QfNtSV7fy/MCAMD88kl6AADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANAjIAADQICAzaDNnzsw+++yTzTbbLJtvvnkuvfTSTJ8+PW94wxsyevTo9PX15bLLLkuSTJkyJa985SszevTojB49On//938/9zgHHnhg1lhjjWy55ZbPO/5Xv/rVrLPOOnP3Offcc4f0+QH00mDeQx9++OG8613vymtf+9q8/vWvzzXXXPOix3nOcccdl8022yxbbLFFvvCFLwz5c4RFXq21J19JNk0yvfH1aJLxSVZJcl6SmzvfR77UsbbZZpvKwuNDH/pQ/d73vldrrfXJJ5+sDz/8cN1ll13queeeW2ut9Re/+EXdcccda621XnjhhfXtb3971+NcdNFFddq0aXWLLbZ4XvuRRx5ZjznmmN49AYAWDeY99LDDDqtf/epXa621Xn/99XWnnXZ60ePUWusFF1xQx44dW5944olaa6333XffUDwtWCQlmVq7ZM+ejSDXWm+stY6utY5Osk2SWUnOTHJ4ksm11o2TTO7cZhHxyCOP5OKLL85BBx2UJFlmmWUyYsSIlFLy6KOPzn3Mq171qpc81pvf/OasssoqPa0XYGEy2PfQ6667LjvttFOSZLPNNsvtt9+e++67b57HSZITTjghhx9+eJZddtkkyRprrDGUTxEWC0M1xWJskltrrXck2TPJpE77pCR7DVENLAC///3vs/rqq+cjH/lItt5663z0ox/N448/nmOPPTaf//zns+666+awww7LN77xjbn7XHrppdlqq62y22675dprrx3Qeb7zne/kta99bQ488MA8/PDDvXo6AENqsO+hW221Vc4444wkyWWXXZY77rgjd9111zyPkyQ33XRTLrnkkmy33XbZcccdc/nll7f2fGFRNVQB+b1J/quzvWat9Z7O9r1J1uy2QyllXCllaill6gMPPDAUNTIAc+bMyRVXXJGDDz44V155ZVZcccUcffTROeGEE/Ktb30rd955Z771rW/NHdV43etelzvuuCNXXXVVDj300Oy1114veY6DDz44t956a6ZPn5611147n/vc53r8rACGxmDfQw8//PDMnDkzo0ePznHHHZett946w4YNm+dxnjvHQw89lN/+9rc55phjsu+++z439REYqG7zLhbkV5JlksxIfzBOkpkvuP/hlzqGOcgLj3vuuaeuv/76c29ffPHFdffdd68rr7xyffbZZ2uttT777LP1Fa94Rdf9119//frAAw/Mvf373//+L+YgN73U/QCLkpfzHvrss8/W9ddfvz7yyCPzPE6tte666671ggsumHvfRhttVO+///7ePCFYxGWo5yA37JbkilrrfZ3b95VS1k6Szvf7h6AGFpC11lor6667bm688cYkyeTJk/Oa17wmr3rVq3LRRRclSS644IJsvPHGSZJ777137sjFZZddlmeffTarrrrqi57jnnvumbt95pln/sUqFwCLqsG+h86cOTNPPfVUkuT73/9+3vzmN2fllVee53GSZK+99sqFF16YpH+6xVNPPZXVVlttSJ8nLOqGD8E53pc/T69Ikp8nOSDJ0Z3vZw1BDSxAxx13XD7wgQ/kqaeeykYbbZQf/OAH2XPPPfOZz3wmc+bMyXLLLZeJEycmSU477bSccMIJGT58eJZffvmcfPLJKaUkSd73vvdlypQpmTFjRkaNGpWvfe1rOeigg/KFL3wh06dPTyklG2ywQb773e+2+XQBFqjBvIdef/31OeCAA1JKyRZbbJETTzzxRY+T9C+heeCBB2bLLbfMMsssk0mTJs193wUGpjw3uteTg5eyYpI/JNmo1vpIp23VJKcmWS/JHUn2rbU+9GLH6evrq1OnTu1ZnQAALHlKKdNqrX0vbO/pCHKt9fEkq76g7cH0r2oBAAALHZ+kBwAADQIyAAA0CMgAANAgIAMAQIOADAAADQIyAAA0CMgAANAgIAMAQIOADAAADQIyAAA0CMgAANAgIAMAQIOADAAADQIyAAA0CMgAANAgIAMAQIOADAAADQIyAAA0CMgAANAgIAMAQIOADAAADQIyAAA0CMgAANAgIAMAQIOADAAADQIyAAA0CMgMifHjx2f8+PFtlwGwSPIeCkNreNsFsGSYPn162yUALLK8h8LQMoIMAAANAjIAADQIyAAA0CAgAwBAg4AMAAANAjIAADQIyAAA0CAgAwBAg4AMAAANAjIAADQIyAAA0CAgAwBAg4AMAAANAjIAADQIyAAA0CAgAwBAg4AMAAANAjIAADQIyAAA0CAgAwBAg4AMAAANAjIAADQIyAAA0CAgAwBAg4AMAAANAjIAADQIyAAA0CAgAwBAg4AMAAANAjKwUPn2t7+dLbfcMltssUWOPfbYue0PPfRQdtlll2y88cbZZZdd8vDDDydJTj/99GyxxRZ505velAcffDBJcuutt2a//fab5zl23333zJw580XrOOmkk/LHP/7xZT+fl/LVr341EyZM6Pl5ABi4ngbkUsqIUspppZQbSinXl1K2L6WsUko5r5Ryc+f7yF7WACw6rrnmmnzve9/LZZddlquuuirnnHNObrnlliTJ0UcfnbFjx+bmm2/O2LFjc/TRRydJjjvuuFx++eX5+Mc/np/85CdJki9/+cv5+te/Ps/znHvuuRkxYsSL1jI/AXnOnDmDejwAC6dejyB/O8mvaq2bJdkqyfVJDk8yuda6cZLJndsAuf7667PddttlhRVWyPDhw7PjjjvmjDPOSJKcddZZOeCAA5IkBxxwQH72s58lSZZaaqk8+eSTmTVrVpZeeulccsklWWuttbLxxhvP8zwbbLBBZsyYkdtvvz2bb755Pvaxj2WLLbbIW9/61syePTunnXZapk6dmg984AMZPXp0Zs+enWnTpmXHHXfMNttsk1133TX33HNPkmTMmDEZP358+vr68g//8A9Zf/318+yzzyZJHn/88ay77rp5+umn873vfS/bbrttttpqq+y9996ZNWtWD3sSgJejZwG5lPLKJG9OcmKS1FqfqrXOTLJnkkmdh01KslevagAWLVtuuWUuueSSPPjgg5k1a1bOPffc3HnnnUmS++67L2uvvXaSZK211sp9992XJDniiCOy88475+yzz8773ve+HHXUUfnKV74y4HPefPPN+eQnP5lrr702I0aMyOmnn5599tknfX19+fGPf5zp06dn+PDhOfTQQ3Paaadl2rRpOfDAA/OlL31p7jGeeuqpTJ06NUceeWRGjx6diy66KElyzjnnZNddd83SSy+dd7/73bn88stz1VVXZfPNN8+JJ564oLoNgAVseA+PvWGSB5L8oJSyVZJpST6TZM1a6z2dx9ybZM1uO5dSxiUZlyTrrbdeD8sEFhabb755vvjFL+atb31rVlxxxYwePTrDhg37i8eVUlJKSZLssssu2WWXXZIkP/zhD7P77rvnpptuyoQJEzJy5Mh8+9vfzgorrDDPc2644YYZPXp0kmSbbbbJ7bff/hePufHGG3PNNdfMPc8zzzwzN6wned585/322y+nnHJK3vKWt+Tkk0/OIYcckqR/+siXv/zlzJw5M4899lh23XXXwXUOAEOml1Mshid5XZITaq1bJ3k8L5hOUWutSWq3nWutE2utfbXWvtVXX72HZQILk4MOOijTpk3LxRdfnJEjR2aTTTZJkqy55ppzpzXcc889WWONNZ6336xZs3LSSSflk5/8ZI488shMmjQpO+ywQ3784x+/6PmWXXbZudvDhg3rOo+41potttgi06dPz/Tp03P11Vfn17/+9dz7V1xxxbnb73znO/OrX/0qDz30UKZNm5addtopSfLhD3843/nOd3L11VfnyCOPzBNPPDHIngFgqPQyIN+V5K5a6+86t09Lf2C+r5SydpJ0vt/fwxqARcz99/e/JfzhD3/IGWeckfe///1J+oPnpEn9s7MmTZqUPffc83n7HXPMMfn0pz+dpZdeOrNnz04pJUsttdR8z/V9xStekT/96U9Jkk033TQPPPBALr300iTJ008/nWuvvbbrfiuttFK23XbbfOYzn8kee+wxdwT8T3/6U9Zee+08/fTTLxnaAWhXz6ZY1FrvLaXcWUrZtNZ6Y5KxSa7rfB2Q5OjO97N6VQOw6Nl7773z4IMPZumll86//du/zV1t4vDDD8++++6bE088Meuvv35OPfXUufv88Y9/zGWXXZYjjzwySXLooYdm2223zYgRI+ZezDdYH/7wh/OJT3wiyy+/fC699NKcdtpp+fSnP51HHnkkc+bMyfjx47PFFlt03Xe//fbLe97znkyZMmVu21FHHZXtttsuq6++erbbbru54RuAhU/pn+XQo4OXMjrJ95Msk+S2JB9J/6j1qUnWS3JHkn1rrQ+92HH6+vrq1KlTe1YnvTdmzJgkeV5gAGBgvIdCb5RSptVa+17Y3suL9FJrnZ7kL06a/tFkAABY6PgkPQAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgYXjbBbBkuOWWW/LYY49lzJgxbZcCsMiZPn16VlpppbbLgCWGEWQAAGgwgsyQePWrX50kmTJlSruFACyC/PUNhpYRZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABoEZAAAaBCQAQCgQUAGAIAGARkAABqG9/LgpZTbk/wpyTNJ5tRa+0opqyQ5JckGSW5Psm+t9eFe1gEAAAM1FCPIb6m1jq619nVuH55kcq114ySTO7cBAGCh0MYUiz2TTOpsT0qyVws1AABAV70OyDXJr0sp00op4zpta9Za7+ls35tkzW47llLGlVKmllKmPvDAAz0uE+D5ZsyYkRkzZrRdBgAt6Okc5CQ71FrvLqWskeS8UsoNzTtrrbWUUrvtWGudmGRikvT19XV9DECvrLbaam2XAEBLejqCXGu9u/P9/iRnJnl9kvtKKWsnSef7/b2sAWB+nHTSSTnppJPaLgOAFvQsIJdSViylvOK57SRvTXJNkp8nOaDzsAOSnNWrGgDml4AMsOTq5RSLNZOcWUp57jw/qbX+qpRyeZJTSykHJbkjyb49rAEAAAalZwG51npbkq26tD+YZGyvzgsAAC+HT9IDAIAGARkAABp6vcwbwCLp3HPPbbsEAFoiIAN0scIKK7RdAgAtMcUCoIvjjz8+xx9/fNtlANACARmgi1NPPTWnnnpq22UA0AIBGQAAGgRkAABoEJABAKBBQAYAgAbLvAF0MWXKlLZLAKAlRpABAKBBQAboYsKECZkwYULbZQDQAgEZoItzzjkn55xzTttlANACARkAABoEZAAAaBCQAQCgwTJvAF0sv/zybZcAQEsEZIAufvnLX7ZdAgAtMcUCAAAaBGSALo466qgcddRRbZcBQAsEZIAuJk+enMmTJ7ddBgAtEJABAKBBQAYAgAYBGQAAGizzBtDFqquu2nYJALREQAbo4vTTT2+7BABaYooFAAA0CMgAXRxxxBE54ogj2i4DgBaYYgHQxaWXXtp2CQC0xAgyAAA0CMgAANAgIAMAQIM5yABdjBo1qu0SAGiJgAzQxY9+9KO2SwCgJaZYAABAg4AM0MX48eMzfvz4tssAoAWmWAB0MX369LZLAKAlRpABAKBBQAYAgAYBGQAAGsxBBuhik002absEAFoiIAN0MXHixLZLAKAlplgAAECDgAzQxbhx4zJu3Li2ywCgBaZYAHRx0003tV0CAC0Z9AhyKWVkKeW1vSgGAADaNqCAXEqZUkpZuZSySpIrknyvlPIvvS0NAACG3kBHkF9Za300ybuT/LDWul2SnXtXFgAAtGOgc5CHl1LWTrJvki/1sB6AhcLo0aPbLgGAlgw0IH8tyX8n+Z9a6+WllI2S3Ny7sgDadeyxx7ZdAgAtGWhAvqfWOvfCvFrrbeYgAwCwOBroHOTjBtgGsFjYf//9s//++7ddBgAteNER5FLK9knemGT1Usr/a9y1cpJhvSwMoE133XVX2yUA0JKXmmKxTJKVOo97RaP90ST79KooAABoy4sG5FrrRUkuKqWcVGu9Y4hqAgCA1gz0Ir1lSykTk2zQ3KfWulMvigIAgLYMNCD/NMm/J/l+kmd6Vw7AwmH77bdvuwQAWjLQgDyn1nrC/JyglDIsydQkd9da9yilbJjk5CSrJpmW5IO11qfm59gAvfKNb3yj7RIAaMlAl3k7u5RySCll7VLKKs99DXDfzyS5vnH7m0m+VWt9dZKHkxw0iHoBAKCnBhqQD0jy+SS/Sf+o77T0jwq/qFLKqCRvT//UjJRSSpKdkpzWecikJHsNqmKAIbD33ntn7733brsMAFowoCkWtdYN5/P4xyb5Qv68RNyqSWbWWud0bt+VZJ1uO5ZSxiUZlyTrrbfefJ4eYP48+OCDbZcAQEsGFJBLKR/q1l5r/eGL7LNHkvtrrdNKKWMGW1itdWKSiUnS19dXB7s/AADMj4FepLdtY3u5JGOTXJFkngE5yd8keWcpZffOPisn+XaSEaWU4Z1R5FFJ7h501QAA0CMDnWJxaPN2KWVE+leieLF9jkhyROfxY5IcVmv9QCnlp+n/FL6T0z+3+azBFg0AAL0y0BHkF3o8yfzOS/5ikpNLKV9PcmWSE+fzOAA9M3bs2LZLAKAlA52DfHaS5+YBD0uyeZJTB3qSWuuUJFM627clef1gigQYal/5ylfaLgGAlgx0BHlCY3tOkjtqrXf1oB4AAGjVgNZBrrVelOSG9C/XNjKJT74DFmu77bZbdtttt7bLAKAFAwrIpZR9k1yW5D1J9k3yu1LKPr0sDKBNs2fPzuzZs9suA4AWDHSKxZeSbFtrvT9JSimrJzk/f/5EPAAAWCwM9KOml3ouHHc8OIh9AQBgkTHQEeRflVL+O8l/dW7vl+Tc3pQEAADtedGAXEp5dZI1a62fL6W8O8kOnbsuTfLjXhcH0JY99tij7RIAaMlLjSAfm86n4dVaz0hyRpKUUv66c987elgbQGsOO+ywtksAoCUvNY94zVrr1S9s7LRt0JOKAACgRS8VkEe8yH3LL8A6ABYqY8aMyZgxY9ouA4AWvFRAnlpK+dgLG0spH00yrTclAQBAe15qDvL4JGeWUj6QPwfiviTLJHlXD+sCAIBWvGhArrXel+SNpZS3JNmy0/yLWusFPa8MAABaMKB1kGutFya5sMe1AABA6wb6QSEAS5R999237RIAaImADNDFIYcc0nYJALTkpVaxAFgizZo1K7NmzWq7DABaYAQZoIvdd989STJlypR2CwFgyBlBBgCABgEZAAAaBGQAAGgQkAEAoMFFegBdfPjDH267BABaIiADdCEgAyy5TLEA6GLGjBmZMWNG22UA0AIjyABd7LPPPkmsgwywJDKCDAAADQIyAAA0CMgAANAgIAMAQIOL9AC6OPjgg9suAYCWCMgAXey3335tlwBAS0yxAOjizjvvzJ133tl2GQC0wAgyQBcf/OAHk1gHGWBJZAQZAAAaBGQAAGgQkAEAoEFABgCABhfpAXTxuc99ru0SAGiJgAzQxTve8Y62SwCgJaZYAHRx44035sYbb2y7DABaYAQZoIuPf/zjSayDDLAkMoIMAAANAjIAADQIyAAA0CAgAwBAg4v0ALr48pe/3HYJALREQAboYuedd267BABaYooFQBfTp0/P9OnT2y4DgBYYQQboYvz48UmsgwywJDKCDAAADQIyAAA0CMgAANAgIAMAQIOL9AC6+Md//Me2SwCgJQIyQBdvfOMb2y4BgJaYYgHQxW9+85v85je/absMAFpgBBmgi7/9279NYh1kgCVRz0aQSynLlVIuK6VcVUq5tpTytU77hqWU35VSbimlnFJKWaZXNQAAwGD1corFk0l2qrVulWR0kreVUt6Q5JtJvlVrfXWSh5Mc1MMaAABgUHoWkGu/xzo3l+581SQ7JTmt0z4pyV69qgEAAAarpxfplVKGlVKmJ7k/yXlJbk0ys9Y6p/OQu5KsM499x5VSppZSpj7wwAO9LBMAAObq6UV6tdZnkowupYxIcmaSzQax78QkE5Okr6+v9qRAgHk49thj2y4BgJYMySoWtdaZpZQLk2yfZEQpZXhnFHlUkruHogaAwRg9enTbJQDQkl6uYrF6Z+Q4pZTlk+yS5PokFybZp/OwA5Kc1asaAObX+eefn/PPP7/tMgBoQS9HkNdOMqmUMiz9QfzUWus5pZTrkpxcSvl6kiuTnNjDGgDmy9e//vUkyc4779xyJQAMtZ4F5Frr/yXZukv7bUle36vzAgDAy+GjpgEAoEFABgCABgEZAAAahmSZN4BFzXe/+922SwCgJQIyQBebbrpp2yUA0BJTLAC6OPvss3P22We3XQYALTCCDNDFP//zPydJ3vGOd7RcCQBDzQgyAAA0CMgAANAgIAMAQIOADAAADS7SA+jiP//zP9suAYCWCMgAXay77rptlwBAS0yxAOjilFNOySmnnNJ2GQC0wAgyQBcnnHBCkmS//fZruRIAhpoRZAAAaBCQAQCgQUAGAIAGARkAABpcpAfQxWmnndZ2CQC0REAG6GK11VZruwQAWmKKBUAXJ510Uk466aS2ywCgBQIyQBcCMsCSS0AGAIAGARkAABoEZAAAaBCQAQCgwTJvAF2ce+65bZcAQEsEZIAuVlhhhbZLAKAlplgAdHH88cfn+OOPb7sMAFogIAN0ceqpp+bUU09tuwwAWiAgAwBAg4AMAAANAjIAADQIyAAA0GCZN4AupkyZ0nYJALTECDIAADQIyABdTJgwIRMmTGi7DABaICADdHHOOefknHPOabsMAFogIAMAQIOADAAADQIyAAA0WOYNoIvll1++7RIAaImADNDFL3/5y7ZLAKAlplgAAECDgAzQxVFHHZWjjjqq7TIAaIGADNDF5MmTM3ny5LbLAKAFAjIAADQIyAAA0CAgAwBAg2XeALpYddVV2y4BgJYIyABdnH766W2XAEBLTLEAAIAGARmgiyOOOCJHHHFE22UA0AJTLAC6uPTSS9suAYCWGEEGAIAGARkAABp6FpBLKeuWUi4spVxXSrm2lPKZTvsqpZTzSik3d76P7FUNAAAwWL0cQZ6T5HO11tckeUOST5ZSXpPk8CSTa60bJ5ncuQ2wUBk1alRGjRrVdhkAtKBnF+nVWu9Jck9n+0+llOuTrJNkzyRjOg+blGRKki/2qg6A+fGjH/2o7RIAaMmQzEEupWyQZOskv0uyZic8J8m9Sdacxz7jSilTSylTH3jggaEoEwAAeh+QSykrJTk9yfha66PN+2qtNUnttl+tdWKtta/W2rf66qv3ukyA5xk/fnzGjx/fdhkAtKCn6yCXUpZOfzj+ca31jE7zfaWUtWut95RS1k5yfy9rAJgf06dPb7sEAFrSy1UsSpITk1xfa/2Xxl0/T3JAZ/uAJGf1qgYAABisXo4g/02SDya5upQyvdP2t0mOTnJqKeWgJHck2beHNQAAwKD0chWL/0lS5nH32F6dFwAAXo6ezkEGWFRtsskmbZcAQEsEZIAuJk6c2HYJALRkSNZBBgCARYWADNDFuHHjMm7cuLbLAKAFplgAdHHTTTe1XQIALTGCDAAADQIyAAA0CMgAANBgDjJAF6NHj267BABaIiADdHHssce2XQIALTHFAgAAGgRkgC7233//7L///m2XAUALTLEA6OKuu+5quwQAWmIEGQAAGowgAwuVDQ7/xQI93u1Hv32BHg+AxZ8RZAAAaDCCDNDF9ttv33YJALREQAbo4hvf+EbbJQDQElMsAACgQUAG6GLvvffO3nvv3XYZALTAFAuALh588MG2SwCgJQIyLEEsobb4828M8PKZYgEAAA0CMgAANJhiAdDF2LFj2y4BgJYIyABdfOUrX2m7BABaYooFAAA0CMgAXey2227Zbbfd2i4DgBaYYgHQxezZs9suAYCWGEEGAIAGARkAABoEZAAAaDAHGaCLPfbYo+0SAGiJgAzQxWGHHdZ2CQC0RECGBWiDw3+xQI93+9FvX6DHY+GzoF8zALx85iADdDFmzJiMGTOm7TIAaIGADAAADQIyAAA0CMgAANAgIAMAQINVLAC62HfffdsuAYCWCMgAXRxyyCFd2y3LBrD4M8UCoItZs2Zl1qxZbZcBQAuMIAN0sfvuuydJpkyZ0m4hAAw5I8gAANAgIAMAQIOADAAADQIyAAA0uEgPoIsPf/jDbZdASxb0Un63H/32BXo8oPcEZIAuBGSAJZcpFgBdzJgxIzNmzGi7DABaYAQZoIt99tkniXWQAZZEAjKwUJnwnte2XUKS5PDDD+/a3mZ9d8+cnW+dd3Nr5wdYUgjIwELlrodnt11CkuTee+/t2t5mfaNGLt/auQGWJAIyAEPKKhHAws5FegAA0GAEGaCLvr6+tksAoCUCMrDQ+uwum2abse/I/odPSJI888ycHLnfDll/s63ysa9/t6fn3nLLLeduT5kyJePHj8/TTz+dJ4etmE/9y4/6208/Kb/95U9TSsnaG2yS933+G1l6mWXzr599f56c9XiS5LGZD2a9zV6bg752/Nzj/eHG/8u3P/3efPBL/5LRb35bkuTh+/+Yk//5y5n5wD0ppWTcP0zMKmuN6ulzBKC7ngXkUsp/JNkjyf211i07baskOSXJBkluT7JvrfXhXtUALNqWWW6F3PP7m/PUk09kmWWXy03T/jevXHXNITn3I488kiSpteaQQw7Jr371q6y33no56qeXJklmzrgvl/zsh/ni98/NMssul5OO+kyuvPAXef2u786nv/WTucf5wdcOzZZvHDv39rPPPJOzvz8hm27zN88734+/+cXs8v5PZNNt/iZPzn48pZgBB9CWXr4Dn5TkbS9oOzzJ5Frrxkkmd24DzNNrXr9jrvvdlCTJFRf+Iq97y58vyHpy9qz814Qj8q1P7ZMJn9grV//m/CTJQ/felX/97Psz4eB35XWve11+85vfJOkfCR4zZkz22WefbLbZZvnABz6QWmvX85555pk588wz85Of/CTvfve7s9566yVJXjFy1bmPefaZZ/L0k0/kmWfm5Oknn8jKq67xvGM88fhjuXn6b/PXb9x5btslZ/1nttph16w04s/HufeOW/LsM3PmhuZll18xyyxnxQqAtvQsINdaL07y0Aua90wyqbM9KclevTo/sHjY+i2758op5+bpp57MH2+7MetvvtXc+87/yb9n49FvyGe/c1o+OeGHOXviMXly9qysNGLVHPzNH+SwE87MKaeckk9/+tNz97nyyitz7LHH5rrrrsttt92W//3f/02S/N3f/V1+/vOf/8X5b7rppjz88MMZM2ZMttlmm1x+3s+SJCNWWzNj9jkwf/+Bt+TI/XbIciuulM36dnjevlf/5vxsvPX2WW7FlZL0jzpf/T/n543veN/zHvfAXbdn+ZVWzn989VOZ8Im98vOJ38yzzzyzQPoPgMEb6jnIa9Za7+ls35tknn8rLaWMSzIuydyRG2DhsiCX65rwntd2XWP4VRttlofuvStXXHBOXvP6HZ933w3T/ifX/PaCXPjT/0iSPP3Uk5l5/z1ZebU1cvqxf58/3npDjllqqTxw9+059vybcstVd2atV2+Z026YldxwS5ZabYP8+zmXZuoTa2SVN++f25Ice/5NSZLHHp6VJJn2+xm586ZrcvA/nZSnn3oi3/70e7P+5ltlpVeukmsunZyv/OfkLL/SK3LSUZ/J1PPPSt/Oe86t74oLz8kbdnvP3Ns/O/4fssdHD8tSSz1/bOKZZ+bktqun5nP//rOMXGPt/PDrn81lvz7jefsCMHRau0iv1lpLKd3/ttl//8QkE5Okr69vno8DFn9bbr9Tfj7xn/LJf/5hZj0683n3feTv/jVrrLvR89p+9cPj8oqRq+Ww756VWp/NF3b/86ffDV96mbnbSy017CVHal+5+lpZYeURWXb5FbLs8ivkr17blz/eekOSZNW1RmWlEaskSV67w1tz+3VXzg3Ijz3yUP5ww9U58Kv/NvdYd958TX74j/8vSfL4Iw/n+ssvyrBhwzNitbWyzl9tntXWXrf/+b5xbO64/qpkt8H0EgALylBfBXJfKWXtJOl8v3+Izw8sgrZ72z7Z9YOfzKs23PR57Ztts0Mu+dmP5s4jvuuW65Iksx//U1ZeZfUstdRSmXreWXn22fmfrvDX24/N76+ZlmeemZOnnpidO274v6y53l9l5Bqvyu3XX5WnnpidWmtuuvLSrLHeX83d76qL/zuvecOYLL3MsnPbvvKfF+TvftT/tdWbds3ehx6Zv/6bnbPepn+d2Y8/msdm9s9Ku2X677LW+q+e75oBeHmGegT550kOSHJ05/tZQ3x+YBE0YvW18uZ3fegv2nfZ/5D87IR/zDHj3pln67NZda1R+djXv5sd3vH+/ODvD83U83+WzfrelGWWW+Elz/HLk76ddTfZcu6KE8uus3mSZKVV1slm274px4x7Z8pSS+UNu+2TtTfcJEmy1Zt2zT8f8q4sNWx41vmrzfPG3febe7wrp5ybse/92ICe31LDhuWd476Y479wQFKTURtvkTfsbnoFQFvKvK7gftkHLuW/koxJslqS+5IcmeRnSU5Nsl6SO9K/zNsLL+T7C319fXXq1Kk9qZOhMWbMmCT9qwgszhb2j9Bd0PUtSPOag8yfjRq5fA776f8N6Tl78THOS9rPyYKob0l5D4WhVkqZVmv9i0+G6tkIcq31ffO4a+w82gEWGs/MejRJMmyFlVuuBIChZiV6gC5m33pZZt96WdtlANACHzUNwDwtzNNynrMo1AgsWowgAwBAgxFkYKFw98zZGTVy4fl45dmjXpUkWX4hqunumS5iBBgKAjKwUPjWeTe3XcLz3PuTryRJ1nr/0S1XAsBQE5ABunjlG9/bdgkAtERABuhi+Q1Gt10CAC0RkOmp564uv/e2B593e3714kMLFmauzm/PU/fdliRZZs2NWq4EgKFmFQuALh6aPDEPTZ7YdhkAtEBABgCABgEZAAAaBGQAAGgQkAEAoMEqFgBdjHjzAW2XAEBLBGSALpYbtXnbJQDQElMsALp44q7r88Rd17ddBgAtEJABuph58aTMvHhS22UA0AIBGQAAGgRkAABoEJABAKBBQAYAgAbLvDHXBof/ou0SXtKCrvH2o9++QI/H4mOVsePaLgGAlgjIAF0ss+ZGbZcAQEtMsQDoYvbt0zP79ultlwFAC4wgA3TxyG9OTpIsv8HodgsBYMgZQQYAgAYBGQAAGgRkAABoEJABAKDBRXoAXay666faLgGAlgjIAF0sveqotksAoCWmWAB0MeuW32XWLb9ruwwAWmAEGaCLRy87M0mywqu3a7kSAIaaEWQAAGgwggwAPbTB4b942ce497YH5x7r9qPf/rKP17Qg6mta0PVBG4wgAwBAg4AMAAANplgAdLHaHp9ruwQAWiIgA3QxfOXV2y4BgJaYYgHQxePXX5zHr7+47TIAaIERZIAu/nTluUmSFTd/c8uVADDUBOQXYembxd+C/jcGABZ9plgAAECDgAwAAA0CMgAANJiDDNDF6nsd0XYJALREQAboYtgKr2y7BABaIiADdPHY1ecnSVb6651brgSez+o70HvmIAN08djV588NyQAsWQRkAABoEJABAKBBQAYAgAYBGQAAGqxiAdDFGu/5atslANASAXkILeileW4/+u0L9HjAny219HJtlwCLpF4sQ7eg/79b0v4/XtiXBlwY+88UC4Au/nTFL/KnKxbu/1QA6A0BGaCLx2+4JI/fcEnbZQDQAgEZAAAaWgnIpZS3lVJuLKXcUko5vI0aAACgmyEPyKWUYUn+LcluSV6T5H2llNcMdR0AANBNGyPIr09yS631tlrrU0lOTrJnC3UAAMBfKLXWoT1hKfskeVut9aOd2x9Msl2t9VMveNy4JOM6NzdNcuMQlrlakhlDeL5Fmb4aOH01MPpp4PTVwOmrgdNXA6OfBm5h7qv1a62rv7BxoV0HudY6McnENs5dSplaa+1r49yLGn01cPpqYPTTwOmrgdNXA6evBkY/Ddyi2FdtTLG4O8m6jdujOm0AANC6NgLy5Uk2LqVsWEpZJsl7k/y8hToAAOAvDPkUi1rrnFLKp5L8d5JhSf6j1nrtUNfxElqZ2rGI0lcDp68GRj8NnL4aOH01cPpqYPTTwC1yfTXkF+kBAMDCzCfpAQBAg4AMAAANi3VALqWMKKWcVkq5oZRyfSll+1LK6FLKb0sp00spU0spr3/BPtuWUuZ01mt+ru2AUsrNna8DGu3blFKu7nxk9r+WUspQPr8FaTB9VUoZU0p5pNM+vZTyd43jdP0Y8c5Fmb/rtJ/SuUBzkTTY11Wnv6aXUq4tpVzUaF+s+2qQr6nPN15P15RSnimlrNK5b7Hup2TQffXKUsrZpZSrOq+pjzSO473q+X01spRyZinl/0opl5VStmwcZ0l9XW1VSrm083o4u5SycuPxR3Se942llF0b7Yt1Xw2mn0opq5ZSLiylPFZK+c4LjtP156yUskop5bzOz+V5pZSRbTzPBWGQfbVLKWVap31aKWWnxnEW/r6qtS62X0kmJfloZ3uZJCOS/DrJbp223ZNMaTx+WJILkpybZJ9O2ypJbut8H9nZHtm577Ikb0hSkvzyueMuil+D6askY5Kc0+UYw5LcmmSjzjGuSvKazn2nJnlvZ/vfkxzc9nMeor4akeS6JOt1bq+xpPTVYH/+Gvu9I8kFS0o/zcdr6m+TfLOzvXqShzr7eK/6y746JsmRne3Nkkz2usrlSXbstB2Y5KjO9ms6/bBskg07/TNsSeirQfbTikl2SPKJJN95wXG6/pwl+ackh3e2D3/u53dR/BpkX22d5FWd7S2T3L0o9dViO4JcSnllkjcnOTFJaq1P1VpnJqlJnvuN+ZVJ/tjY7dAkpye5v9G2a5Lzaq0P1VofTnJekreVUtZOsnKt9be1/1/yh0n26t0z6p357Ktuun6MeOc3w52SnNZ53KQsOX31/iRn1Fr/0Hn8c6+txbqvXuZr6n1J/quzvVj3UzJffVWTvKLTByulPyDPifeqbn31mvQPeqTWekOSDUopa2bJfl1tkuTizsPOS7J3Z3vPJCfXWp+stf4+yS3p76fFuq8G20+11sdrrf+T5IkXHOfFfs72TH//JItoPyXz1VdX1lqf+1m8NsnypZRlF5W+WmwDcvp/A34gyQ9KKVeWUr5fSlkxyfgkx5RS7kwyIckRSVJKWSfJu5Kc8ILjrJPkzsbtuzpt63S2X9i+KBpUX3VsX/r/xPvLUsoWnbZ59dWqSWbWWue8oH1RNNi+2iTJyFLKlM6fmD7UaV/c+2p+XlMppayQ5G3p/0U1Wfz7KRl8X30nyebpD4FXJ/lMrfXZeK/q1ldXJXl3kpT+aRfrp//DqZbk19W16Q8hSfKe/PmDu17s9bM499Vg+2leXuznbM1a6z2d7XuTrLlAKh96L6ev9k5yRa31ySwifbU4B+ThSV6X5IRa69ZJHk//cP3BST5ba103yWfT+U0oybFJvtj5j2ZJM9i+uiL9n12+VZLjkvxsyCtuz2D7aniSbZK8Pf0jfF8ppWwy5FUPvcH203PekeR/a60PDWWxLRtsX+2aZHqSVyUZneQ7pTGPdDE32L46OsmIUsr09P+F8Mokzwx10S2ZV18dmOSQUsq0JK9I8lR7JS4UhrSfOiOmi+r6uvPVV51BtG8m+fhgTtZ2Xy3OAfmuJHfVWn/XuX1a+v9hD0hyRqftp+n/81GS9CU5uZRye5J9khxfStkr8/5o7Ls72y9sXxQNqq9qrY/WWh/rbJ+bZOlSymqZd189mP7/pIa/oH1RNNjX1V1J/rvzZ7kZ6f8z1FZZ/PtqsP30nPfmz9MrksW/n5LB99VH0j9tp9Zab0ny+/TPr/Ve1e+F71UfqbWOTvKh9M/Zvi1L8Ouq1npDrfWttdZt0v+zdmvn/hd7/SzOfTXYfpqXF/s5u68zreC5qRj3Z9E06L4qpYxKcmaSD9Vam6+1hb6vFtuAXGu9N8mdpZRNO01j03+x1B+T7Nhp2ynJzZ3Hb1hr3aDWukH6/9EPqbX+LP2f+PfW0n819Mgkb01/4LknyaOllDd05mJ9KMlZQ/PsFqzB9lUpZa3GFaevT//r6MHM42PEO78FXpj+XzyS/v/Mloi+Sv/z3KGUMrwzfWC7JNdnMe+r+ein5+a37ZjnP9/Fup+S+eqrP3Qek8582k3TH/q8V/VrvleNKH9eWeGjSS6utT6aJfh1VUpZI0lKKUsl+XL6L65Lkp8neW9njuiGSTZO/4VUi3VfzUc/zes4L/Zz9vP090+yiPZTMvi+KqWMSPKL9F9097+N4ywafVUXgqsie/WV/j8/Tk3yf+mfBjAy/VefTkv/3LTfJdmmy34npbOKRf3zVZm3dL4+0mjvS3JN+n9b+k46n0y4KH4Npq+SfCr9c46uSvLbJG9sHGf3JDd1+uRLjfaN0v9me0v6R3iWbfs5D9XrKsnn0/+f+DVJxi8pfTUf/fTh9F8k9MLjLNb9NNi+Sv/Uil+nf/7xNUn2bxzHe9Xz+2r7zmvnxvSPMI/0uspnOs/7pvRPQSmNx3+p0x83prHSyeLeV/PRT7en/+LYx9I/qvrcqh5df87SP197cvp/cTs/ySptP+eh6Kv0h+XH0z8l7Lmv51ZyWuj7ykdNAwBAw2I7xQIAAOaHgAwAAA0CMgAANAjIAADQICADAECDgAwAAA0CMgAANPx/wiVj/tIZpaEAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "ax = mean_distribution.plot.hist(bins=30, figsize=(10, 8))\n", "ax.plot(confidence_interval, [55, 55], color=\"black\")\n", "for x in confidence_interval:\n", " ax.plot([x, x], [0, 65], color=\"black\")\n", " ax.text(\n", " x,\n", " 70,\n", " f\"{x:.0f}\",\n", " horizontalalignment=\"center\",\n", " verticalalignment=\"center\",\n", " )\n", "ax.text(\n", " sum(confidence_interval) / 2,\n", " 60,\n", " \"90% interval\",\n", " horizontalalignment=\"center\",\n", " verticalalignment=\"center\",\n", ")\n", "\n", "meanIncome = mean_distribution.mean()\n", "ax.plot([meanIncome, meanIncome], [0, 50], color=\"black\", linestyle=\"--\")\n", "ax.text(\n", " meanIncome,\n", " 10,\n", " f\"Mean: {meanIncome:.0f}\",\n", " bbox=dict(facecolor=\"white\", edgecolor=\"white\", alpha=0.5),\n", " horizontalalignment=\"center\",\n", " verticalalignment=\"center\",\n", ")\n", "ax.set_ylim(0, 80)\n", "ax.set_ylabel(\"Counts\")\n", "\n", "plt.tight_layout();" ] }, { "cell_type": "code", "execution_count": 13, "id": "2b647865-1f2c-484c-9fcb-6453e8a50355", "metadata": {}, "outputs": [], "source": [ "group1 = [19, 20, 14, 23, 15, 18]\n", "group2 = [13, 14, 8, 17, 9, 12]" ] }, { "cell_type": "code", "execution_count": 14, "id": "f6118595-e8bc-4607-bc1a-a4612d8a1f02", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10.966666666666667" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "10.966666666666667" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m1 = np.mean(group1)\n", "m2 = np.mean(group2)\n", "var1 = statistics.variance(group1, xbar=None)\n", "var2 = statistics.variance(group2, xbar=None)\n", "var1\n", "var2" ] }, { "cell_type": "code", "execution_count": 15, "id": "f6f41623-08f8-4d9a-bbc6-22e0b6928320", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.138156196894831" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(m1 - m2) / np.sqrt((var1 / 6) + (var2 / 6))" ] }, { "cell_type": "code", "execution_count": 16, "id": "226ed025-c9d2-40a7-b4cf-f6e029f338c9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Ttest_indResult(statistic=3.138156196894831, pvalue=0.010543184275035807)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats.ttest_ind(group1, group2)" ] }, { "cell_type": "code", "execution_count": 17, "id": "8df460fc-829c-4592-ad72-80b5f43d4dbf", "metadata": {}, "outputs": [], "source": [ "def read_derm_data():\n", " np.random.seed(1)\n", "\n", " # Histopathological Attributes: (values 0, 1, 2, 3)\n", " # Clinical Attributes: (values 0, 1, 2, 3, unless indicated)\n", " features = [\n", " \"erythema\",\n", " \"scaling\",\n", " \"definite borders\",\n", " \"itching\",\n", " \"koebner phenomenon\",\n", " \"polygonal papules\",\n", " \"follicular papules\",\n", " \"oral mucosal involvement\",\n", " \"knee and elbow involvement\",\n", " \"scalp involvement\",\n", " \"family history\", # 0 or 1\n", " \"melanin incontinence\",\n", " \"eosinophils in the infiltrate\",\n", " \"PNL infiltrate\",\n", " \"fibrosis of the papillary dermis\",\n", " \"exocytosis\",\n", " \"acanthosis\",\n", " \"hyperkeratosis\",\n", " \"parakeratosis\",\n", " \"clubbing of the rete ridges\",\n", " \"elongation of the rete ridges\",\n", " \"thinning of the suprapapillary epidermis\",\n", " \"spongiform pustule\",\n", " \"munro microabcess\",\n", " \"focal hypergranulosis\",\n", " \"disappearance of the granular layer\",\n", " \"vacuolisation and damage of basal layer\",\n", " \"spongiosis\",\n", " \"saw-tooth appearance of retes\",\n", " \"follicular horn plug\",\n", " \"perifollicular parakeratosis\",\n", " \"inflammatory monoluclear inflitrate\",\n", " \"band-like infiltrate\",\n", " \"Age\", # linear; missing marked '?'\n", " \"TARGET\", # See mapping\n", " ]\n", "\n", " targets = {\n", " 1: \"psoriasis\", # 112 instances\n", " 2: \"seboreic dermatitis\", # 61\n", " 3: \"lichen planus\", # 72\n", " 4: \"pityriasis rosea\", # 49\n", " 5: \"cronic dermatitis\", # 52\n", " 6: \"pityriasis rubra pilaris\", # 20\n", " }\n", "\n", " data = os.path.join(data_dir, \"dermatology.data\")\n", " df = pd.read_csv(data, header=None, names=features, na_values=[\"?\"])\n", " df[\"TARGET\"] = df.TARGET.map(targets)\n", "\n", " derm = df.copy()\n", " derm.loc[derm.Age == \"?\", \"Age\"] = None\n", " derm[\"Age\"] = derm.Age.astype(float)\n", " return derm" ] }, { "cell_type": "code", "execution_count": 18, "id": "ef10fdef-711f-4546-a9bd-d54a31366cec", "metadata": {}, "outputs": [], "source": [ "derm = read_derm_data()" ] }, { "cell_type": "code", "execution_count": 19, "id": "90fcb5e3-4e4c-466a-8f76-c9481ed04593", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
erythemascalingdefinite bordersitchingAgeTARGET
247222062.0psoriasis
127222244.0lichen planus
230320130.0seboreic dermatitis
162322222.0lichen planus
159322147.0seboreic dermatitis
296211319.0cronic dermatitis
\n", "
" ], "text/plain": [ " erythema scaling definite borders itching Age TARGET\n", "247 2 2 2 0 62.0 psoriasis\n", "127 2 2 2 2 44.0 lichen planus\n", "230 3 2 0 1 30.0 seboreic dermatitis\n", "162 3 2 2 2 22.0 lichen planus\n", "159 3 2 2 1 47.0 seboreic dermatitis\n", "296 2 1 1 3 19.0 cronic dermatitis" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use iloc on both axes and sample\n", "derm.iloc[:, [0, 1, 2, 3, -2, -1]].sample(6)" ] }, { "cell_type": "code", "execution_count": 20, "id": "43734f3f-a997-436c-93c9-04785c6a5c47", "metadata": {}, "outputs": [], "source": [ "clean, suspicious = [], {}\n", "for col in derm.columns:\n", " values = derm[col].unique()\n", " if set(values) <= {0, 1, 2, 3}:\n", " clean.append(col)\n", " else:\n", " suspicious[col] = values" ] }, { "cell_type": "code", "execution_count": 21, "id": "46b6f7d4-77a0-4e7d-b70e-039caa83568d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "No problem detected:\n" ] }, { "ename": "NameError", "evalue": "name 'pprint' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_128943/136470058.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No problem detected:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"... {len(clean) - 8} other fields\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'pprint' is not defined" ] } ], "source": [ "print(\"No problem detected:\")\n", "pprint(clean[:8])\n", "print(f\"... {len(clean) - 8} other fields\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d1180b53-e9b0-407d-bd1a-22474288e195", "metadata": {}, "outputs": [], "source": [ "# Notice age has some expected ages and also a '?'\n", "print(\"Suspicious:\")\n", "pprint(suspicious)" ] }, { "cell_type": "code", "execution_count": null, "id": "afb090dc-1a70-4aa3-945f-ac8331cc84db", "metadata": {}, "outputs": [], "source": [ "# dataset encodes missing values as ?\n", "# --> added ? to na_values list to be recognized when reading\n", "# file in panda's read_csv\n", "derm.loc[derm.Age.isnull()].iloc[:, -4:]\n", "# alternatively:\n", "# Assign missing ages marked with '?' as None\n", "# derm.loc[derm.Age == '?', 'Age'] = None # or NaN\n", "# Convert string/None ages to floating-point\n", "# derm['Age'] = derm.Age.astype(float)" ] }, { "cell_type": "code", "execution_count": null, "id": "2e990956-1d39-4894-ab5e-a3ac8712cfea", "metadata": {}, "outputs": [], "source": [ "# detailed plot\n", "# count values\n", "age_stats = pd.DataFrame(\n", " [\n", " derm[derm.Age == x][\"Age\"].count()\n", " for x in np.arange(derm[\"Age\"].min(), derm[\"Age\"].max())\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "295def47-3520-42d2-b6df-3093585e4900", "metadata": {}, "outputs": [], "source": [ "# bar plot\n", "plot = age_stats.plot(\n", " kind=\"bar\",\n", " xlabel=\"Age\",\n", " ylabel=\"Count\",\n", " legend=False,\n", ")\n", "\n", "# adapt ticks on x-axis\n", "for ind, label in enumerate(plot.get_xticklabels()):\n", " if ind % 10 == 0: # every 10th label is kept\n", " label.set_visible(True)\n", " else:\n", " label.set_visible(False)" ] }, { "cell_type": "code", "execution_count": null, "id": "86677c98-7415-4126-840c-a5bc5fffec5b", "metadata": {}, "outputs": [], "source": [ "# mean and median seem like best solution\n", "derm.Age.mean(), derm.Age.median()" ] }, { "cell_type": "code", "execution_count": null, "id": "f153af7d-d05e-4902-82be-45be358be237", "metadata": {}, "outputs": [], "source": [ "derm.Age.mode()" ] }, { "cell_type": "markdown", "id": "e9888b03-63b3-466e-b8ac-8e862a8c5bb6", "metadata": {}, "source": [ "Alternatively, this dataset was created in Turkey - we might also want to use domain knowledge and use the mean/median age in Turkey at the time of dataset creation." ] }, { "cell_type": "code", "execution_count": null, "id": "c3aa2477-b374-411a-bc44-fb84d48e9d10", "metadata": {}, "outputs": [], "source": [ "# use sklearn's SimpleImputer\n", "imputer = SimpleImputer(strategy=\"mean\")\n", "# expects 2d array or dataframe\n", "derm[\"Age_imputed\"] = imputer.fit_transform(pd.DataFrame(derm[\"Age\"]))" ] }, { "cell_type": "code", "execution_count": null, "id": "b9cbaf00-26a8-4763-af9f-d6146408ee51", "metadata": {}, "outputs": [], "source": [ "derm.loc[derm.Age.isnull()].iloc[:, -4:]" ] }, { "cell_type": "code", "execution_count": null, "id": "c289112b-0a6b-438a-98f2-9c911f1dc9ea", "metadata": {}, "outputs": [], "source": [ "# load digits dataset (adopted, now missing a few pixels)\n", "digits = np.load(os.path.join(data_dir, \"digits.npy\"))\n", "print(\"Array shape:\", digits.shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "e1526d7e-8c7b-48dd-953a-1db09d8d050d", "metadata": {}, "outputs": [], "source": [ "# display digits\n", "def show_digits(digits=digits, x=3, y=3, title=\"Digits\"):\n", " \"Display of 'corrupted numerals'\"\n", " if digits.min() >= 0:\n", " newcm = cm.get_cmap(\"Greys\", 17)\n", " else:\n", " gray = cm.get_cmap(\"Greys\", 18)\n", " newcolors = gray(np.linspace(0, 1, 18))\n", " newcolors[:1, :] = np.array([1.0, 0.9, 0.9, 1])\n", " newcm = ListedColormap(newcolors)\n", "\n", " fig, axes = plt.subplots(\n", " x,\n", " y,\n", " figsize=(x * 2.5, y * 2.5),\n", " subplot_kw={\"xticks\": (), \"yticks\": ()},\n", " )\n", "\n", " for ax, img in zip(axes.ravel(), digits):\n", " ax.imshow(img, cmap=newcm)\n", " for i in range(8):\n", " for j in range(8):\n", " if img[i, j] == -1:\n", " s = \"╳\"\n", " c = \"k\"\n", " else:\n", " s = str(img[i, j])\n", " c = \"k\" if img[i, j] < 8 else \"w\"\n", " _ = ax.text(j, i, s, color=c, ha=\"center\", va=\"center\")\n", " fig.suptitle(title, y=0)\n", " fig.tight_layout()" ] }, { "cell_type": "code", "execution_count": null, "id": "6b4cf573-8f4d-4f8e-9c38-1528a875dce2", "metadata": {}, "outputs": [], "source": [ "show_digits(digits, title=\"Digits with missing pixels\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b81a57ad-a81f-445b-83b2-d2352e614167", "metadata": {}, "outputs": [], "source": [ "# Coded for clarity, not for best vectorized speed\n", "# Function definition only; used in later cell\n", "def fill_missing(digit):\n", " digit = digit.copy()\n", " missing = np.where(digit == -1)\n", " for y, x in zip(*missing): # Pull off x/y position of pixel\n", " # Do not want negative indices in slice\n", " x_start = max(0, x - 1)\n", " y_start = max(0, y - 1)\n", " # No harm in index larger than size\n", " x_end = x + 2\n", " y_end = y + 2\n", " # What if another -1 is in region? Remove all the -1s\n", " region = digit[y_start:y_end, x_start:x_end].flatten()\n", " region = region[region >= 0]\n", " total = np.sum(region)\n", " avg = total // region.size\n", " digit[y, x] = avg\n", " return digit" ] }, { "cell_type": "code", "execution_count": null, "id": "a92431e8-ede0-4e84-ab13-3669b2f91d73", "metadata": {}, "outputs": [], "source": [ "new = np.empty_like(digits)\n", "for n in range(new.shape[0]):\n", " new[n] = fill_missing(digits[n])\n", "\n", "show_digits(digits, title=\"Digits with missing pixels\"), show_digits(\n", " new, title=\"Digits with imputed pixels\"\n", ")" ] }, { "cell_type": "markdown", "id": "6d0aa854-2346-468c-b869-3cb615e5d6d0", "metadata": {}, "source": [ "The following initial examples are based on the iris datasets (directly loaded via scikit-learn). The dataset consists of 50 samples from three species of the iris flower and describes its sepals (Kelchblatt) and petals (Blütenblatt) (length and width). More information on the dataset can be found here: https://en.wikipedia.org/wiki/Iris_flower_data_set." ] }, { "cell_type": "markdown", "id": "e30aed5e-b026-44ce-9da4-ae698e02dfa9", "metadata": {}, "source": [ "Note that we could also load the iris dataset from scikit learn via the `load_iris` method and then convert it to a dataframe and reset the column names." ] }, { "cell_type": "code", "execution_count": null, "id": "67acad73-88ee-445f-b303-af99f37af6c0", "metadata": {}, "outputs": [], "source": [ "# load iris dataset from seaborne\n", "iris = sns.load_dataset(\"iris\")\n", "iris" ] }, { "cell_type": "code", "execution_count": null, "id": "935e8acd-f27c-4cec-829f-36c0240da5d4", "metadata": {}, "outputs": [], "source": [ "# first look at data\n", "sns.pairplot(iris, hue=\"species\", palette=\"viridis\");" ] }, { "cell_type": "code", "execution_count": null, "id": "8c6e7a68-b433-4399-9c76-0507730015b5", "metadata": {}, "outputs": [], "source": [ "iris.cov()" ] }, { "cell_type": "code", "execution_count": null, "id": "b6ba2df8-f27c-40eb-909b-9d27c628f3a6", "metadata": {}, "outputs": [], "source": [ "iris.corr()" ] }, { "cell_type": "code", "execution_count": null, "id": "1a6df280-a126-41ee-8c5b-2f299471109f", "metadata": {}, "outputs": [], "source": [ "def iris_preprocessing(iris_data):\n", " # encode species label as numeric value\n", " label_encoder = preprocessing.LabelEncoder()\n", " iris_data[\"species_id\"] = label_encoder.fit_transform(iris.species)\n", "\n", " # scale values\n", " scaler = preprocessing.MinMaxScaler()\n", " iris_data[\n", " [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", " ] = scaler.fit_transform(\n", " iris_data[\n", " [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", " ]\n", " )\n", " return iris_data" ] }, { "cell_type": "code", "execution_count": null, "id": "e3222fe3-9f71-4fb9-a8a3-ef9dad329859", "metadata": {}, "outputs": [], "source": [ "# preprocessing\n", "iris = sns.load_dataset(\"iris\")\n", "iris = iris_preprocessing(iris)\n", "iris[:10]" ] }, { "cell_type": "code", "execution_count": null, "id": "50e46151-194a-4ec9-b2cb-355e8c668b9d", "metadata": {}, "outputs": [], "source": [ "ax = iris.plot.scatter(\n", " x=\"petal_length\", y=\"petal_width\", c=\"species_id\", colormap=\"viridis\"\n", ");" ] }, { "cell_type": "code", "execution_count": null, "id": "fc13815a-80e4-48bd-813b-8a6129bd7dde", "metadata": {}, "outputs": [], "source": [ "# apply PCA, reduce to a single dimension\n", "petals = iris[[\"petal_length\", \"petal_width\"]]\n", "pca = PCA(n_components=1)\n", "pca.fit(petals)\n", "petals_1d = pca.transform(petals)" ] }, { "cell_type": "code", "execution_count": null, "id": "9892b191-3eee-4d22-9d59-0be19aee77aa", "metadata": {}, "outputs": [], "source": [ "petals_1d[:15]" ] }, { "cell_type": "code", "execution_count": null, "id": "567712a3-2554-4822-af66-01c2e0578d69", "metadata": {}, "outputs": [], "source": [ "# actual components of the PCA (principal axes)\n", "pca.components_" ] }, { "cell_type": "code", "execution_count": null, "id": "fe92f4e0-35ec-4970-bd29-92df0cb81616", "metadata": {}, "outputs": [], "source": [ "pca.explained_variance_ratio_" ] }, { "cell_type": "code", "execution_count": null, "id": "56c720f9-a073-4eb1-ac33-6b29e0e4a918", "metadata": {}, "outputs": [], "source": [ "petals_inverse = pca.inverse_transform(petals_1d)" ] }, { "cell_type": "code", "execution_count": null, "id": "07377fe7-7eb8-45b5-8d4a-33e44b9d3309", "metadata": {}, "outputs": [], "source": [ "petals" ] }, { "cell_type": "code", "execution_count": null, "id": "b386e633-ccc3-43fd-bc81-98a4a5c83388", "metadata": {}, "outputs": [], "source": [ "# visualize sepals\n", "plt.scatter(\n", " petals[\"petal_length\"], petals[\"petal_width\"], s=50, label=\"2d data\"\n", ")\n", "plt.scatter(petals_inverse[:, 0], petals_inverse[:, 1], label=\"1d data\")\n", "plt.xlabel(\"Petal length\")\n", "plt.ylabel(\"Petal width\")\n", "plt.legend();" ] }, { "cell_type": "code", "execution_count": null, "id": "c13bd66e-d7cc-442d-8163-c30645f1bfb2", "metadata": {}, "outputs": [], "source": [ "plt.scatter(petals_1d, y=np.zeros(len(petals_1d)))\n", "plt.xlabel(\"PC1\")" ] }, { "cell_type": "markdown", "id": "24c18f82-a823-413d-8a5f-53365abc8672", "metadata": {}, "source": [ "We will continue to use the iris dataset, but now use all four features." ] }, { "cell_type": "markdown", "id": "cff7ae65-f278-4d9d-8916-95c432eb1471", "metadata": {}, "source": [ "
\n", "Note: The follow code is based on plot.ly. To show plot.ly charts in this notebook in jupyer lab, we ned to install an extension: `jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyterlab-plotly`
" ] }, { "cell_type": "code", "execution_count": null, "id": "3eb30d87-b7d4-404d-8f05-95f1a4b8020d", "metadata": {}, "outputs": [], "source": [ "# load iris dataset from seaborne\n", "iris = sns.load_dataset(\"iris\")" ] }, { "cell_type": "code", "execution_count": null, "id": "bbc52ea1-a06d-41d1-a772-7b39f6cdbac5", "metadata": {}, "outputs": [], "source": [ "features = [\"sepal_width\", \"sepal_length\", \"petal_width\", \"petal_length\"]\n", "fig = px.scatter_matrix(\n", " iris, dimensions=features, color=\"species\", width=1000, height=800\n", ")\n", "fig.update_traces(diagonal_visible=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "1e37cb24-87df-42e4-a05b-b52c7ffeb6e3", "metadata": {}, "outputs": [], "source": [ "# preprocessing\n", "iris = iris_preprocessing(iris)" ] }, { "cell_type": "code", "execution_count": null, "id": "a53b42e1-7250-44dc-92e8-da68b009b71b", "metadata": {}, "outputs": [], "source": [ "pca = PCA(n_components=3)\n", "pcs_3d = pca.fit_transform(iris[features])" ] }, { "cell_type": "code", "execution_count": null, "id": "5387887c-cf8e-4dbd-9766-edf0b082b8ac", "metadata": {}, "outputs": [], "source": [ "# this code is partially adapted from https://plotly.com/python/pca-visualization/\n", "fig = px.scatter_3d(\n", " pcs_3d,\n", " x=0,\n", " y=1,\n", " z=2,\n", " color=iris[\"species\"],\n", " labels={\"0\": \"PC 1\", \"1\": \"PC 2\", \"2\": \"PC 3\"},\n", " width=900,\n", " height=900,\n", ")\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "61e2cde0-c177-466b-a1db-310d56b5b2b3", "metadata": {}, "outputs": [], "source": [ "# look at explained variance contribution of each PC\n", "explained_variance_stats = np.cumsum(pca.explained_variance_ratio_)\n", "\n", "px.area(\n", " x=range(1, explained_variance_stats.shape[0] + 1),\n", " y=explained_variance_stats,\n", " labels={\"x\": \"# Components\", \"y\": \"Explained Variance\"},\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "88d683cd-762d-4c06-9064-7e8d317f6a72", "metadata": {}, "outputs": [], "source": [ "# plot as elbow curve\n", "number_pcs = np.arange(pca.n_components_) + 1\n", "plt.plot(number_pcs, pca.explained_variance_ratio_)\n", "plt.title(\"Elbow Plot\")\n", "plt.xlabel(\"PC\")\n", "plt.ylabel(\"Variance Explained (%)\")\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "43bdc113-6342-433a-bcfb-a5f84c70df7d", "metadata": {}, "source": [ "Using PCA, we reduce the number of dimensions to 2, essentially projecting the points onto a 2d plane/surface. This is easily observable when printing it in 3D." ] }, { "cell_type": "code", "execution_count": null, "id": "1100ee6e-950b-411b-a496-5ff728ac131f", "metadata": {}, "outputs": [], "source": [ "pca = PCA(n_components=2)\n", "pcs_2d = pca.fit_transform(iris[features])" ] }, { "cell_type": "code", "execution_count": null, "id": "3e078204-145c-45a2-b1f3-0bf2b09a0a09", "metadata": {}, "outputs": [], "source": [ "fig = px.scatter_3d(\n", " pcs_2d,\n", " x=0,\n", " y=1,\n", " z=np.zeros(len(iris)),\n", " color=iris[\"species\"],\n", " labels={\"0\": \"PC 1\", \"1\": \"PC 2\", \"2\": \"PC 3\"},\n", " width=900,\n", " height=900,\n", ")\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "8e203fc7-c89b-43ef-ab43-21aafffe00b3", "metadata": {}, "outputs": [], "source": [ "# plot 2d plot\n", "# include principal components\n", "\n", "pca = PCA(n_components=2)\n", "components = pca.fit_transform(iris[features])\n", "# loadings = actual weights for the linear combination for the projection\n", "loadings = pca.components_.T * np.sqrt(pca.explained_variance_)\n", "\n", "fig = px.scatter(\n", " components,\n", " x=0,\n", " y=1,\n", " color=iris[\"species\"],\n", " labels={\"0\": \"PC1\", \"1\": \"PC2\"},\n", ")\n", "\n", "for i, feature in enumerate(features):\n", " fig.add_shape(\n", " type=\"line\", x0=0, y0=0, x1=loadings[i, 0], y1=loadings[i, 1]\n", " )\n", " fig.add_annotation(\n", " x=loadings[i, 0],\n", " y=loadings[i, 1],\n", " ax=0,\n", " ay=0,\n", " xanchor=\"center\",\n", " yanchor=\"bottom\",\n", " text=feature,\n", " )\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "e6b6801f-54e1-4453-9bf0-7755f8476d82", "metadata": {}, "outputs": [], "source": [ "pca.components_" ] }, { "cell_type": "code", "execution_count": null, "id": "d0df46b2-52a3-4e67-9665-93391f64ef57", "metadata": {}, "outputs": [], "source": [ "pca.explained_variance_ratio_" ] }, { "cell_type": "markdown", "id": "9aeca1cd-aa62-4e41-a48e-c7200d21e2fb", "metadata": {}, "source": [ "In the following, we will perform PCA step-by-step, again based on the iris dataset." ] }, { "cell_type": "code", "execution_count": null, "id": "f4ed46dc-2140-4e84-a764-c9c2276c9f1b", "metadata": {}, "outputs": [], "source": [ "# load iris dataset from seaborne, preprocessing\n", "# preprocessing includes centering\n", "iris = sns.load_dataset(\"iris\")\n", "features = [\"sepal_width\", \"sepal_length\", \"petal_width\", \"petal_length\"]\n", "# for reasons of simplicity, reduce pandas df to simple numerical matrix\n", "iris = iris[features].values" ] }, { "cell_type": "code", "execution_count": null, "id": "d6b51565-4e68-48da-ad86-9d11a1c24fa4", "metadata": {}, "outputs": [], "source": [ "# compute covariance matrix on centered data\n", "mean = np.mean(iris, axis=0)\n", "iris_centered = iris - mean\n", "covariance_matrix = np.cov(iris_centered.T)\n", "covariance_matrix" ] }, { "cell_type": "code", "execution_count": null, "id": "2a7c0f1b-998c-44ab-bd98-e736aa0beb55", "metadata": {}, "outputs": [], "source": [ "# perform eigendecomposition on covariance matrix\n", "eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)" ] }, { "cell_type": "code", "execution_count": null, "id": "dd2013cb-78ca-4d39-a686-1873900ee04f", "metadata": {}, "outputs": [], "source": [ "eigenvalues" ] }, { "cell_type": "code", "execution_count": null, "id": "a10a7db0-6061-4650-b79a-f460b7539e84", "metadata": {}, "outputs": [], "source": [ "eigenvectors" ] }, { "cell_type": "code", "execution_count": null, "id": "c10dbf3f-8025-433f-a832-77a62f4f7200", "metadata": {}, "outputs": [], "source": [ "# np.linalg.eig does not necessarily return eigenvalues and vectors sorted\n", "# by Eigenvalue --> sort both vectors analoguously\n", "sorted_index = np.argsort(eigenvalues)[::-1]\n", "eigenvalues_sorted = eigenvalues[sorted_index]\n", "eigenvectors_sorted = eigenvectors[:, sorted_index]\n", "eigenvalues_sorted" ] }, { "cell_type": "code", "execution_count": null, "id": "ed3f3d67-71d4-4ac9-b241-2a10c47df74c", "metadata": {}, "outputs": [], "source": [ "# from the eigenvalues, we can already compute the contribution to\n", "# explained variance for each PC\n", "sum_eigenvalues = np.sum(eigenvalues_sorted)\n", "for i in range(len(eigenvalues_sorted)):\n", " print(f\"PC{i}: {(eigenvalues_sorted[i] / sum_eigenvalues).round(2)}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b7e557a9-50d7-45ca-9f48-f061c4295b13", "metadata": {}, "outputs": [], "source": [ "# we choose to use 2 components\n", "eigenvectors = eigenvectors_sorted[:, 0:2]\n", "eigenvectors" ] }, { "cell_type": "code", "execution_count": null, "id": "d64bed45-4a2c-4c29-9bf6-c09be1c54ec8", "metadata": {}, "outputs": [], "source": [ "# apply eigenvectors to original data to reduce dimensions\n", "iris_2d = np.dot(\n", " eigenvectors.transpose(), iris_centered.transpose()\n", ").transpose()" ] }, { "cell_type": "code", "execution_count": null, "id": "90303ea2-a2d7-4e1a-b520-7af82f923419", "metadata": {}, "outputs": [], "source": [ "# visualize results\n", "fig = px.scatter(\n", " iris_2d,\n", " x=0,\n", " y=1,\n", " # color=iris[\"species\"],\n", " labels={\"0\": \"PC1\", \"1\": \"PC2\"},\n", ")\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "1db78571-2cd2-4e45-998a-3f2e9bcc1928", "metadata": {}, "outputs": [], "source": [ "# principal components are orthogonal\n", "np.matrix.round(np.cov(eigenvectors[0], eigenvectors[1]))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }