Commit 0a3ae332 authored by Eva Zangerle's avatar Eva Zangerle
Browse files

added note on yelp dataset to notebook

parent 138dcb2d
......@@ -22,6 +22,7 @@
"import os\n",
"from pprint import pprint\n",
"from sys import getsizeof\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
......@@ -33,9 +34,8 @@
"from sklearn import linear_model\n",
"from sklearn.cluster import DBSCAN, KMeans\n",
"from sklearn.datasets import make_blobs\n",
"from sklearn.feature_extraction import text\n",
"from sklearn.metrics import pairwise_distances_argmin\n",
"from sklearn.feature_extraction import FeatureHasher"
"from sklearn.feature_extraction import FeatureHasher, text\n",
"from sklearn.metrics import pairwise_distances_argmin"
]
},
{
......@@ -2347,6 +2347,15 @@
"In the following, we will look at different representations of text and feature extraction methods for text. We will make use of the Yelp Dataset (https://www.kaggle.com/yelp-dataset/yelp-dataset): \"This dataset is a subset of Yelp's businesses, reviews, and user data. It was originally put together for the Yelp Dataset Challenge which is a chance for students to conduct research or analysis on Yelp's data and share their discoveries. In the most recent dataset you'll find information about businesses across 8 metropolitan areas in the USA and Canada.\" This example is adapted from the FeatEng book."
]
},
{
"cell_type": "markdown",
"id": "78930164-4e89-426f-b153-efdc4a1cf7ef",
"metadata": {},
"source": [
"<div class=\"alert alert-block alert-info\">\n",
"<b>Note:</b> The dataset is too large to store on GitLab. Please download it directly using the link above, unzip it, and store it in the data directory.</div>"
]
},
{
"cell_type": "code",
"execution_count": 90,
......@@ -3423,8 +3432,8 @@
"metadata": {},
"outputs": [],
"source": [
"feature_hasher = FeatureHasher(n_features=m, input_type='string')\n",
"hashed = feature_hasher.transform(reviews['business_id'])"
"feature_hasher = FeatureHasher(n_features=m, input_type=\"string\")\n",
"hashed = feature_hasher.transform(reviews[\"business_id\"])"
]
},
{
......@@ -3444,8 +3453,8 @@
],
"source": [
"# We can see how this will make a difference in the future by looking at the size of each\n",
"print('Our pandas Series, in bytes: ', getsizeof(reviews['business_id']))\n",
"print('Our hashed numpy array, in bytes: ', getsizeof(hashed))"
"print(\"Our pandas Series, in bytes: \", getsizeof(reviews[\"business_id\"]))\n",
"print(\"Our hashed numpy array, in bytes: \", getsizeof(hashed))"
]
},
{
......@@ -3486,8 +3495,8 @@
}
],
"source": [
"# compare oriignal data to hashed data \n",
"reviews['business_id'].unique().tolist()[0:5]\n",
"# compare original data to hashed data\n",
"reviews[\"business_id\"].unique().tolist()[0:5]\n",
"hashed.toarray()"
]
},
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment