Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Institut für Informatik
dbis
dbis-teaching
data-engineering-analytics-notebooks
Commits
4f713afc
Commit
4f713afc
authored
Nov 02, 2021
by
Eva Zangerle
Browse files
merge 07
parent
80411ff8
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
48 deletions
+58
-48
notebooks/07_feature_engineering.ipynb
notebooks/07_feature_engineering.ipynb
+58
-48
No files found.
notebooks/07_feature_engineering.ipynb
View file @
4f713afc
...
...
@@ -33,7 +33,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_dir
='
../data
'
"
"data_dir
= \"
../data
\"
"
]
},
{
...
...
@@ -98,26 +98,26 @@
" \"inflammatory monoluclear inflitrate\",\n",
" \"band-like infiltrate\",\n",
" \"Age\", # linear; missing marked '?'\n",
" \"TARGET\" # See mapping\n",
" \"TARGET\"
,
# See mapping\n",
"]\n",
"\n",
"targets = {\n",
" 1:\"psoriasis\",
# 112 instances\n",
" 2:\"seboreic dermatitis\",
# 61\n",
" 3:\"lichen planus\",
# 72\n",
" 4:\"pityriasis rosea\",
# 49\n",
" 5:\"cronic dermatitis\",
# 52\n",
" 6:\"pityriasis rubra pilaris\", # 20\n",
" 1:
\"psoriasis\", # 112 instances\n",
" 2:
\"seboreic dermatitis\", # 61\n",
" 3:
\"lichen planus\", # 72\n",
" 4:
\"pityriasis rosea\", # 49\n",
" 5:
\"cronic dermatitis\", # 52\n",
" 6:
\"pityriasis rubra pilaris\", # 20\n",
"}\n",
"\n",
"data = os.path.join(data_dir,
'
dermatology.data
'
)\n",
"metadata = os.path.join(data_dir,
'
dermatology.names
'
)\n",
"df = pd.read_csv(data, header=None, names=features, na_values=[
'?'
])\n",
"df[
'
TARGET
'
] = df.TARGET.map(targets)\n",
"data = os.path.join(data_dir,
\"
dermatology.data
\"
)\n",
"metadata = os.path.join(data_dir,
\"
dermatology.names
\"
)\n",
"df = pd.read_csv(data, header=None, names=features, na_values=[
\"?\"
])\n",
"df[
\"
TARGET
\"
] = df.TARGET.map(targets)\n",
"\n",
"derm = df.copy()\n",
"derm.loc[derm.Age ==
'?', '
Age
'
] = None\n",
"derm[
'
Age
'
] = derm.Age.astype(float)"
"derm.loc[derm.Age ==
\"?\", \"
Age
\"
] = None\n",
"derm[
\"
Age
\"
] = derm.Age.astype(float)"
]
},
{
...
...
@@ -432,7 +432,7 @@
"# dataset encodes missing values as ?\n",
"# --> added ? to na_values list to be recognized when reading file in panda's read_csv\n",
"df.loc[df.Age.isnull()].iloc[:, -4:]\n",
"# alternatively:
\n",
"# alternatively:\n",
"# Assign missing ages marked with '?' as None\n",
"# df.loc[df.Age == '?', 'Age'] = None # or NaN\n",
"# Convert string/None ages to floating-point\n",
...
...
@@ -478,16 +478,17 @@
"# example plot taken from CleanData\n",
"# compare to df.Age.mode() with xticks activated\n",
"# problem?\n",
"(df.Age\n",
" .value_counts()\n",
" .sort_index()\n",
" .plot(kind=\"bar\", \n",
" xticks= [],\n",
" #xticks = np.arange(df['Age'].min(), df['Age'].max(), 10),\n",
" yticks=[], \n",
" title=\"Age distribution of patients \"\n",
" f\"({df.Age.min():.0f} to {df.Age.max():.0f})\",\n",
" )\n",
"(\n",
" df.Age.value_counts()\n",
" .sort_index()\n",
" .plot(\n",
" kind=\"bar\",\n",
" xticks=[],\n",
" # xticks = np.arange(df['Age'].min(), df['Age'].max(), 10),\n",
" yticks=[],\n",
" title=\"Age distribution of patients \"\n",
" f\"({df.Age.min():.0f} to {df.Age.max():.0f})\",\n",
" )\n",
")"
]
},
...
...
@@ -520,12 +521,19 @@
],
"source": [
"# improved distribution setup\n",
"distribution = pd.DataFrame([df[df.Age == x]['Age'].count() for x in np.arange(df['Age'].min(), df['Age'].max())])\n",
"distribution.plot(kind=\"bar\",\n",
" xlabel='Age',\n",
" ylabel='Count',\n",
" xticks = np.arange(df['Age'].min(), df['Age'].max(), 10),\n",
" legend=False)"
"distribution = pd.DataFrame(\n",
" [\n",
" df[df.Age == x][\"Age\"].count()\n",
" for x in np.arange(df[\"Age\"].min(), df[\"Age\"].max())\n",
" ]\n",
")\n",
"distribution.plot(\n",
" kind=\"bar\",\n",
" xlabel=\"Age\",\n",
" ylabel=\"Count\",\n",
" xticks=np.arange(df[\"Age\"].min(), df[\"Age\"].max(), 10),\n",
" legend=False,\n",
")"
]
},
{
...
...
@@ -588,7 +596,7 @@
"metadata": {},
"outputs": [],
"source": [
"#hmean((df.loc[df['Age'].notna()]['Age']).values)"
"#
hmean((df.loc[df['Age'].notna()]['Age']).values)"
]
},
{
...
...
@@ -615,7 +623,7 @@
],
"source": [
"# load digits dataset (adopted, now missing a few pixels)\n",
"digits = np.load(
os.path.join(data_dir,
'
digits.npy
'
))\n",
"digits = np.load(os.path.join(data_dir,
\"
digits.npy
\"
))\n",
"print(\"Array shape:\", digits.shape)"
]
},
...
...
@@ -630,16 +638,17 @@
"def show_digits(digits=digits, x=3, y=3, title=\"Digits\"):\n",
" \"Display of 'corrupted numerals'\"\n",
" if digits.min() >= 0:\n",
" newcm = cm.get_cmap(
'
Greys
'
, 17)\n",
" newcm = cm.get_cmap(
\"
Greys
\"
, 17)\n",
" else:\n",
" gray = cm.get_cmap(
'
Greys
'
, 18)\n",
" gray = cm.get_cmap(
\"
Greys
\"
, 18)\n",
" newcolors = gray(np.linspace(0, 1, 18))\n",
" newcolors[:1, :] = np.array([1.0, 0.9, 0.9, 1])\n",
" newcm = ListedColormap(newcolors)\n",
"\n",
" fig, axes = plt.subplots(x, y, figsize=(x*2.5, y*2.5),\n",
" subplot_kw={'xticks':(), 'yticks': ()})\n",
" \n",
" fig, axes = plt.subplots(\n",
" x, y, figsize=(x * 2.5, y * 2.5), subplot_kw={\"xticks\": (), \"yticks\": ()}\n",
" )\n",
"\n",
" for ax, img in zip(axes.ravel(), digits):\n",
" ax.imshow(img, cmap=newcm)\n",
" for i in range(8):\n",
...
...
@@ -650,10 +659,9 @@
" else:\n",
" s = str(img[i, j])\n",
" c = \"k\" if img[i, j] < 8 else \"w\"\n",
" text = ax.text(j, i, s, color=c,\n",
" ha=\"center\", va=\"center\")\n",
" text = ax.text(j, i, s, color=c, ha=\"center\", va=\"center\")\n",
" fig.suptitle(title, y=0)\n",
" fig.tight_layout()
"
" fig.tight_layout()"
]
},
{
...
...
@@ -691,15 +699,15 @@
" missing = np.where(digit == -1)\n",
" for y, x in zip(*missing): # Pull off x/y position of pixel\n",
" # Do not want negative indices in slice\n",
" x_start = max(0, x
-
1)\n",
" y_start = max(0, y
-
1)\n",
" x_start = max(0, x
-
1)\n",
" y_start = max(0, y
-
1)\n",
" # No harm in index larger than size\n",
" x_end = x
+
2\n",
" y_end = y
+
2\n",
" x_end = x
+
2\n",
" y_end = y
+
2\n",
" # What if another -1 is in region? Remove all the -1s\n",
" region = digit[y_start:y_end, x_start:x_end].flatten()\n",
" region = region[region >=0]\n",
" total = np.sum(region)
\n",
" region = region[region >=
0]\n",
" total = np.sum(region)\n",
" avg = total // region.size\n",
" digit[y, x] = avg\n",
" return digit"
...
...
@@ -747,7 +755,9 @@
"for n in range(new.shape[0]):\n",
" new[n] = fill_missing(digits[n])\n",
"\n",
"show_digits(digits, title=\"Digits with missing pixels\"),show_digits(new, title=\"Digits with imputed pixels\")"
"show_digits(digits, title=\"Digits with missing pixels\"), show_digits(\n",
" new, title=\"Digits with imputed pixels\"\n",
")"
]
},
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment