Commit 0a3ae332 authored by Eva Zangerle
Browse files

added note on yelp dataset to notebook

parent 138dcb2d
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
"import os\n", "import os\n",
"from pprint import pprint\n", "from pprint import pprint\n",
"from sys import getsizeof\n", "from sys import getsizeof\n",
"\n",
"import matplotlib.pyplot as plt\n", "import matplotlib.pyplot as plt\n",
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
...@@ -33,9 +34,8 @@ ...@@ -33,9 +34,8 @@
"from sklearn import linear_model\n", "from sklearn import linear_model\n",
"from sklearn.cluster import DBSCAN, KMeans\n", "from sklearn.cluster import DBSCAN, KMeans\n",
"from sklearn.datasets import make_blobs\n", "from sklearn.datasets import make_blobs\n",
"from sklearn.feature_extraction import text\n", "from sklearn.feature_extraction import FeatureHasher, text\n",
"from sklearn.metrics import pairwise_distances_argmin\n", "from sklearn.metrics import pairwise_distances_argmin"
"from sklearn.feature_extraction import FeatureHasher"
] ]
}, },
{ {
...@@ -2347,6 +2347,15 @@ ...@@ -2347,6 +2347,15 @@
"In the following, we will look at different representations of text and feature extraction methods for text. We will make use of the Yelp Dataset (https://www.kaggle.com/yelp-dataset/yelp-dataset): \"This dataset is a subset of Yelp's businesses, reviews, and user data. It was originally put together for the Yelp Dataset Challenge which is a chance for students to conduct research or analysis on Yelp's data and share their discoveries. In the most recent dataset you'll find information about businesses across 8 metropolitan areas in the USA and Canada.\" This example is adapted from the FeatEng book." "In the following, we will look at different representations of text and feature extraction methods for text. We will make use of the Yelp Dataset (https://www.kaggle.com/yelp-dataset/yelp-dataset): \"This dataset is a subset of Yelp's businesses, reviews, and user data. It was originally put together for the Yelp Dataset Challenge which is a chance for students to conduct research or analysis on Yelp's data and share their discoveries. In the most recent dataset you'll find information about businesses across 8 metropolitan areas in the USA and Canada.\" This example is adapted from the FeatEng book."
] ]
}, },
{
"cell_type": "markdown",
"id": "78930164-4e89-426f-b153-efdc4a1cf7ef",
"metadata": {},
"source": [
"<div class=\"alert alert-block alert-info\">\n",
"<b>Note:</b> As the dataset is simply too large to store it on gitlab, please download the dataset directly using the link above, unzip it and store it in the data directory..</div>"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 90, "execution_count": 90,
...@@ -3423,8 +3432,8 @@ ...@@ -3423,8 +3432,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"feature_hasher = FeatureHasher(n_features=m, input_type='string')\n", "feature_hasher = FeatureHasher(n_features=m, input_type=\"string\")\n",
"hashed = feature_hasher.transform(reviews['business_id'])" "hashed = feature_hasher.transform(reviews[\"business_id\"])"
] ]
}, },
{ {
...@@ -3444,8 +3453,8 @@ ...@@ -3444,8 +3453,8 @@
], ],
"source": [ "source": [
"# We can see how this will make a difference in the future by looking at the size of each\n", "# We can see how this will make a difference in the future by looking at the size of each\n",
"print('Our pandas Series, in bytes: ', getsizeof(reviews['business_id']))\n", "print(\"Our pandas Series, in bytes: \", getsizeof(reviews[\"business_id\"]))\n",
"print('Our hashed numpy array, in bytes: ', getsizeof(hashed))" "print(\"Our hashed numpy array, in bytes: \", getsizeof(hashed))"
] ]
}, },
{ {
...@@ -3486,8 +3495,8 @@ ...@@ -3486,8 +3495,8 @@
} }
], ],
"source": [ "source": [
"# compare oriignal data to hashed data \n", "# compare oriignal data to hashed data\n",
"reviews['business_id'].unique().tolist()[0:5]\n", "reviews[\"business_id\"].unique().tolist()[0:5]\n",
"hashed.toarray()" "hashed.toarray()"
] ]
}, },
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment