{
"cells": [
{
"cell_type": "markdown",
"id": "edd718da-1295-49c4-b556-3cc7b718f93c",
"metadata": {
"tags": []
},
"source": [
"# Hypothesis Testing\n",
"Lecture Data Engineering and Analytics<br/>\n",
"Eva Zangerle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5b126eda-5b79-4531-b8ea-72898d09dc6d",
"metadata": {},
"outputs": [],
"source": [
"# import required packages\n",
"import os\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from sklearn.utils import resample"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5406f6f3-1c06-4f3b-aaaf-9ac6f2967729",
"metadata": {},
"outputs": [],
"source": [
"data_dir = \"../data\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "29a380d3-d574-47d1-8649-98bbd9c58edf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Author: Eva Zangerle\n",
"\n",
"Last updated: 2021-12-07 15:52:08\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -a \"Eva Zangerle\" -u -d -t"
]
},
{
"cell_type": "markdown",
"id": "c4abdb51-195d-40d4-a46e-acacdd63e517",
"metadata": {},
"source": [
"## Central Limit Theorem"
]
},
{
"cell_type": "markdown",
"id": "33556303-a6c5-4637-aa75-c319ccc1aaba",
"metadata": {},
"source": [
"In the following, we will investigate the central limit theorem. The code is adapted from (PracticalStatistics): https://github.com/gedeck/practical-statistics-for-data-scientists/"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "656050b5-45a5-4813-a228-e4b9ba03a111",
"metadata": {},
"outputs": [],
"source": [
"# Load the single-column income CSV and collapse it to a Series.\n",
"# The `squeeze` keyword of read_csv was deprecated in pandas 1.4 and removed\n",
"# in 2.0, so the resulting one-column DataFrame is squeezed explicitly.\n",
"loans_income = pd.read_csv(\n",
"    os.path.join(data_dir, \"loans_income.csv\")\n",
").squeeze(\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c0ef827e-8e5e-412f-a6ee-fe57f183d47e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 67000\n",
"1 52000\n",
"2 100000\n",
"3 78762\n",
"4 37041\n",
" ... \n",
"49995 40000\n",
"49996 54000\n",
"49997 50000\n",
"49998 82000\n",
"49999 70000\n",
"Name: x, Length: 50000, dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loans_income"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "668ab92d-218b-45bd-b3dc-ca4dbad4013e",
"metadata": {},
"outputs": [],
"source": [
"# Draw a simple random sample of 1000 incomes and label it \"Data\".\n",
"# A fixed random_state makes the sample identical on every\n",
"# Restart & Run All; change or remove it to draw a different sample.\n",
"sample_data = pd.DataFrame(\n",
"    {\n",
"        \"income\": loans_income.sample(1000, random_state=42),\n",
"        \"type\": \"Data\",\n",
"    }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f65ee2f1-c5bb-4146-9645-711fa3a2dce6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n", " | income | \n", "type | \n", "
---|---|---|
48305 | \n", "70000.00 | \n", "Data | \n", "
10739 | \n", "88224.00 | \n", "Data | \n", "
977 | \n", "39500.00 | \n", "Data | \n", "
24707 | \n", "65000.00 | \n", "Data | \n", "
46225 | \n", "75000.00 | \n", "Data | \n", "
... | \n", "... | \n", "... | \n", "
4995 | \n", "73199.60 | \n", "Mean of 20, 5000 samples | \n", "
4996 | \n", "62895.20 | \n", "Mean of 20, 5000 samples | \n", "
4997 | \n", "58149.70 | \n", "Mean of 20, 5000 samples | \n", "
4998 | \n", "67940.85 | \n", "Mean of 20, 5000 samples | \n", "
4999 | \n", "78757.00 | \n", "Mean of 20, 5000 samples | \n", "
9000 rows × 2 columns
\n", "