Commit 4f713afc authored by Eva Zangerle's avatar Eva Zangerle
Browse files

merge 07

parent 80411ff8
......@@ -33,7 +33,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_dir='../data'"
"data_dir = \"../data\""
]
},
{
......@@ -98,26 +98,26 @@
" \"inflammatory monoluclear inflitrate\",\n",
" \"band-like infiltrate\",\n",
" \"Age\", # linear; missing marked '?'\n",
" \"TARGET\" # See mapping\n",
" \"TARGET\", # See mapping\n",
"]\n",
"\n",
"targets = {\n",
" 1:\"psoriasis\", # 112 instances\n",
" 2:\"seboreic dermatitis\", # 61\n",
" 3:\"lichen planus\", # 72\n",
" 4:\"pityriasis rosea\", # 49\n",
" 5:\"cronic dermatitis\", # 52\n",
" 6:\"pityriasis rubra pilaris\", # 20\n",
" 1: \"psoriasis\", # 112 instances\n",
" 2: \"seboreic dermatitis\", # 61\n",
" 3: \"lichen planus\", # 72\n",
" 4: \"pityriasis rosea\", # 49\n",
" 5: \"cronic dermatitis\", # 52\n",
" 6: \"pityriasis rubra pilaris\", # 20\n",
"}\n",
"\n",
"data = os.path.join(data_dir, 'dermatology.data')\n",
"metadata = os.path.join(data_dir, 'dermatology.names')\n",
"df = pd.read_csv(data, header=None, names=features, na_values=['?'])\n",
"df['TARGET'] = df.TARGET.map(targets)\n",
"data = os.path.join(data_dir, \"dermatology.data\")\n",
"metadata = os.path.join(data_dir, \"dermatology.names\")\n",
"df = pd.read_csv(data, header=None, names=features, na_values=[\"?\"])\n",
"df[\"TARGET\"] = df.TARGET.map(targets)\n",
"\n",
"derm = df.copy()\n",
"derm.loc[derm.Age == '?', 'Age'] = None\n",
"derm['Age'] = derm.Age.astype(float)"
"derm.loc[derm.Age == \"?\", \"Age\"] = None\n",
"derm[\"Age\"] = derm.Age.astype(float)"
]
},
{
......@@ -432,7 +432,7 @@
"# dataset encodes missing values as ?\n",
"# --> added ? to na_values so that pandas' read_csv recognizes it as missing when reading the file\n",
"df.loc[df.Age.isnull()].iloc[:, -4:]\n",
"# alternatively: \n",
"# alternatively:\n",
"# Assign missing ages marked with '?' as None\n",
"# df.loc[df.Age == '?', 'Age'] = None # or NaN\n",
"# Convert string/None ages to floating-point\n",
......@@ -478,13 +478,14 @@
"# example plot taken from CleanData\n",
"# compare to df.Age.mode() with xticks activated\n",
"# problem?\n",
"(df.Age\n",
" .value_counts()\n",
"(\n",
" df.Age.value_counts()\n",
" .sort_index()\n",
" .plot(kind=\"bar\", \n",
" xticks= [],\n",
" #xticks = np.arange(df['Age'].min(), df['Age'].max(), 10),\n",
" yticks=[], \n",
" .plot(\n",
" kind=\"bar\",\n",
" xticks=[],\n",
" # xticks = np.arange(df['Age'].min(), df['Age'].max(), 10),\n",
" yticks=[],\n",
" title=\"Age distribution of patients \"\n",
" f\"({df.Age.min():.0f} to {df.Age.max():.0f})\",\n",
" )\n",
......@@ -520,12 +521,19 @@
],
"source": [
"# improved distribution setup\n",
"distribution = pd.DataFrame([df[df.Age == x]['Age'].count() for x in np.arange(df['Age'].min(), df['Age'].max())])\n",
"distribution.plot(kind=\"bar\",\n",
" xlabel='Age',\n",
" ylabel='Count',\n",
" xticks = np.arange(df['Age'].min(), df['Age'].max(), 10),\n",
" legend=False)"
"# Count the patients for every whole-year age between the youngest and the oldest,\n",
"# so that ages without any patient still get a (zero-height) bar.\n",
"# np.arange excludes its stop value, hence the + 1 to keep the oldest age in the plot.\n",
"distribution = pd.DataFrame(\n",
"    [\n",
"        df[df.Age == x][\"Age\"].count()\n",
"        for x in np.arange(df[\"Age\"].min(), df[\"Age\"].max() + 1)\n",
"    ]\n",
")\n",
"distribution.plot(\n",
"    kind=\"bar\",\n",
"    xlabel=\"Age\",\n",
"    ylabel=\"Count\",\n",
"    # NOTE(review): ticks are bar positions (0..n-1); they presumably equal ages only\n",
"    # when the youngest age is 0 -- TODO confirm for this dataset.\n",
"    xticks=np.arange(df[\"Age\"].min(), df[\"Age\"].max() + 1, 10),\n",
"    legend=False,\n",
")"
]
},
{
......@@ -588,7 +596,7 @@
"metadata": {},
"outputs": [],
"source": [
"#hmean((df.loc[df['Age'].notna()]['Age']).values)"
"# hmean((df.loc[df['Age'].notna()]['Age']).values)"
]
},
{
......@@ -615,7 +623,7 @@
],
"source": [
"# load digits dataset (adapted, now missing a few pixels)\n",
"digits = np.load( os.path.join(data_dir, 'digits.npy'))\n",
"digits = np.load(os.path.join(data_dir, \"digits.npy\"))\n",
"print(\"Array shape:\", digits.shape)"
]
},
......@@ -630,16 +638,17 @@
"def show_digits(digits=digits, x=3, y=3, title=\"Digits\"):\n",
" \"Display of 'corrupted numerals'\"\n",
" if digits.min() >= 0:\n",
" newcm = cm.get_cmap('Greys', 17)\n",
" newcm = cm.get_cmap(\"Greys\", 17)\n",
" else:\n",
" gray = cm.get_cmap('Greys', 18)\n",
" gray = cm.get_cmap(\"Greys\", 18)\n",
" newcolors = gray(np.linspace(0, 1, 18))\n",
" newcolors[:1, :] = np.array([1.0, 0.9, 0.9, 1])\n",
" newcm = ListedColormap(newcolors)\n",
"\n",
" fig, axes = plt.subplots(x, y, figsize=(x*2.5, y*2.5),\n",
" subplot_kw={'xticks':(), 'yticks': ()})\n",
" \n",
" fig, axes = plt.subplots(\n",
" x, y, figsize=(x * 2.5, y * 2.5), subplot_kw={\"xticks\": (), \"yticks\": ()}\n",
" )\n",
"\n",
" for ax, img in zip(axes.ravel(), digits):\n",
" ax.imshow(img, cmap=newcm)\n",
" for i in range(8):\n",
......@@ -650,10 +659,9 @@
" else:\n",
" s = str(img[i, j])\n",
" c = \"k\" if img[i, j] < 8 else \"w\"\n",
" text = ax.text(j, i, s, color=c,\n",
" ha=\"center\", va=\"center\")\n",
" text = ax.text(j, i, s, color=c, ha=\"center\", va=\"center\")\n",
" fig.suptitle(title, y=0)\n",
" fig.tight_layout() "
" fig.tight_layout()"
]
},
{
......@@ -691,15 +699,15 @@
" missing = np.where(digit == -1)\n",
" for y, x in zip(*missing): # Pull off x/y position of pixel\n",
" # Do not want negative indices in slice\n",
" x_start = max(0, x-1)\n",
" y_start = max(0, y-1)\n",
" x_start = max(0, x - 1)\n",
" y_start = max(0, y - 1)\n",
" # No harm in index larger than size\n",
" x_end = x+2\n",
" y_end = y+2\n",
" x_end = x + 2\n",
" y_end = y + 2\n",
" # What if another -1 is in region? Remove all the -1s\n",
" region = digit[y_start:y_end, x_start:x_end].flatten()\n",
" region = region[region >=0]\n",
" total = np.sum(region) \n",
" region = region[region >= 0]\n",
" total = np.sum(region)\n",
" avg = total // region.size\n",
" digit[y, x] = avg\n",
" return digit"
......@@ -747,7 +755,9 @@
"for n in range(new.shape[0]):\n",
" new[n] = fill_missing(digits[n])\n",
"\n",
"show_digits(digits, title=\"Digits with missing pixels\"),show_digits(new, title=\"Digits with imputed pixels\")"
"# Call the two plots as separate statements: joining them with a comma builds a tuple\n",
"# of the return values, which the cell would echo as its output.\n",
"show_digits(digits, title=\"Digits with missing pixels\")\n",
"show_digits(new, title=\"Digits with imputed pixels\")"
]
},
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment