{
"cells": [
{
"cell_type": "markdown",
"id": "edd718da-1295-49c4-b556-3cc7b718f93c",
"metadata": {},
"source": [
"# Hypothesis Testing\n",
"Lecture Data Engineering and Analytics<br>\n",
"Eva Zangerle"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5b126eda-5b79-4531-b8ea-72898d09dc6d",
"metadata": {},
"outputs": [],
"source": [
"# import required packages\n",
"import os\n",
"import statistics\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from scipy import stats\n",
"from sklearn.utils import resample"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5406f6f3-1c06-4f3b-aaaf-9ac6f2967729",
"metadata": {},
"outputs": [],
"source": [
"data_dir = \"../data\""
]
},
{
"cell_type": "markdown",
"id": "c4abdb51-195d-40d4-a46e-acacdd63e517",
"metadata": {},
"source": [
"## Central Limit Theorem"
]
},
{
"cell_type": "markdown",
"id": "33556303-a6c5-4637-aa75-c319ccc1aaba",
"metadata": {},
"source": [
"In the following, we investigate the central limit theorem. The code is adapted from (PracticalStatistics): https://github.com/gedeck/practical-statistics-for-data-scientists/."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "656050b5-45a5-4813-a228-e4b9ba03a111",
"metadata": {},
"outputs": [],
"source": [
"# Load the single-column income CSV and collapse it to a Series.\n",
"# The `squeeze` keyword of read_csv was deprecated in pandas 1.4 and removed\n",
"# in 2.0, so the resulting frame is squeezed along its columns instead.\n",
"loans_income = pd.read_csv(\n",
"    os.path.join(data_dir, \"loans_income.csv\")\n",
").squeeze(\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c0ef827e-8e5e-412f-a6ee-fe57f183d47e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 67000\n",
"1 52000\n",
"2 100000\n",
"3 78762\n",
"4 37041\n",
" ... \n",
"49995 40000\n",
"49996 54000\n",
"49997 50000\n",
"49998 82000\n",
"49999 70000\n",
"Name: x, Length: 50000, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loans_income"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "668ab92d-218b-45bd-b3dc-ca4dbad4013e",
"metadata": {},
"outputs": [],
"source": [
"# Sample 1000 rows and label them as raw data. The fixed random_state makes\n",
"# the draw repeatable across kernel restarts (Restart & Run All).\n",
"sample_data = pd.DataFrame(\n",
"    {\n",
"        \"income\": loans_income.sample(1000, random_state=42),\n",
"        \"type\": \"Data\",\n",
"    }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f65ee2f1-c5bb-4146-9645-711fa3a2dce6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" income type\n",
"19678 48300.0 Data\n",
"5695 40000.0 Data\n",
"13497 60000.0 Data\n",
"6298 30000.0 Data\n",
"1687 75000.0 Data\n"
]
},
{
"data": {
"text/html": [
"
\n", " | income | \n", "type | \n", "
---|---|---|
19678 | \n", "48300.00 | \n", "Data | \n", "
5695 | \n", "40000.00 | \n", "Data | \n", "
13497 | \n", "60000.00 | \n", "Data | \n", "
6298 | \n", "30000.00 | \n", "Data | \n", "
1687 | \n", "75000.00 | \n", "Data | \n", "
... | \n", "... | \n", "... | \n", "
4995 | \n", "78028.85 | \n", "Mean of 20, 5000 samples | \n", "
4996 | \n", "82030.60 | \n", "Mean of 20, 5000 samples | \n", "
4997 | \n", "75391.50 | \n", "Mean of 20, 5000 samples | \n", "
4998 | \n", "56003.35 | \n", "Mean of 20, 5000 samples | \n", "
4999 | \n", "65848.20 | \n", "Mean of 20, 5000 samples | \n", "
9000 rows × 2 columns
\n", "\n", " | erythema | \n", "scaling | \n", "definite borders | \n", "itching | \n", "Age | \n", "TARGET | \n", "
---|---|---|---|---|---|---|
247 | \n", "2 | \n", "2 | \n", "2 | \n", "0 | \n", "62.0 | \n", "psoriasis | \n", "
127 | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "44.0 | \n", "lichen planus | \n", "
230 | \n", "3 | \n", "2 | \n", "0 | \n", "1 | \n", "30.0 | \n", "seboreic dermatitis | \n", "
162 | \n", "3 | \n", "2 | \n", "2 | \n", "2 | \n", "22.0 | \n", "lichen planus | \n", "
159 | \n", "3 | \n", "2 | \n", "2 | \n", "1 | \n", "47.0 | \n", "seboreic dermatitis | \n", "
296 | \n", "2 | \n", "1 | \n", "1 | \n", "3 | \n", "19.0 | \n", "cronic dermatitis | \n", "