Skip to content
Snippets Groups Projects
Commit 6897019d authored by User expired's avatar User expired
Browse files

Replace frequency.ipynb

parent b50cfbc6
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:bf76c08e-eff5-4215-b655-5de623c399b3 tags:
This program **counts the word frequency** <br>
inspired by: https://code.tutsplus.com/tutorials/counting-word-frequency-in-a-file-using-python--cms-25965
%% Cell type:code id:50559021-c803-47a6-b3f3-bc2b5d7065da tags:
``` python
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install matplotlib
```
%% Cell type:code id:e9172b0e-d5be-483d-a670-08b5586b379e tags:
``` python
import csv
import re
from matplotlib import pyplot as plt
import nltk
# Download the NLTK "stopwords" corpus; the word-count cell reads it through
# nltk.corpus.stopwords.words('english').
nltk.download("stopwords")
```
%% Cell type:markdown id:238ce4bc-a9db-438d-b9e9-4a08eeb12dcf tags:
row[1] = speaker name, row[2] = company name, row[3] = talk title, row[4] = abstract, ... <br>
We remove stopwords with **nltk**, but you can also manually add words to the blacklist.
%% Cell type:code id:f33da894-1c23-4bcd-81b3-4ec61bc22413 tags:
``` python
# Count how often each word occurs in the abstract column (row[4]) of the
# semicolon-delimited speaker CSV, ignoring stop words and blacklisted words.
from collections import Counter

# Index of the CSV column that holds the abstract text.
ABSTRACT_COLUMN = 4

# we only consider words which contain letters A to Z and which have a length
# between 3 and 20; the pattern is applied to lower-cased text, otherwise every
# word containing a capital letter would be skipped entirely.
word_pattern = re.compile(r'\b[a-z]{3,20}\b')

stopwords = nltk.corpus.stopwords.words('english')
# you can also manually remove some specific words:
blacklisted = ['also', 'take']
# A single set gives constant-time membership tests for both exclusion lists.
excluded = set(stopwords) | set(blacklisted)

line_count = 0
frequency = Counter()

with open(r"C:\Users\lukas\Downloads\speaker_cyber_insurance.csv", encoding='utf8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=';')
    # first row are column names: count the line but do not analyse it
    if next(csv_reader, None) is not None:
        line_count += 1
    # read each row and count the frequency of each word
    for row in csv_reader:
        line_count += 1
        # Blank or truncated rows have no abstract column; skip them instead
        # of failing with an IndexError.
        if len(row) <= ABSTRACT_COLUMN:
            continue
        words = word_pattern.findall(row[ABSTRACT_COLUMN].lower())
        frequency.update(word for word in words if word not in excluded)

# Words ordered from most to least frequent; the sort is stable, so words with
# equal counts keep the order in which they were first seen.
most_frequent = dict(sorted(frequency.items(), key=lambda elem: elem[1], reverse=True))
most_frequent_count = most_frequent.keys()
```
%% Cell type:markdown id:a7f4800d-e140-419b-9304-618f21390438 tags:
Print words by frequency distribution
%% Cell type:code id:11a15c93-929b-4483-b664-a3c644bcc607 tags:
``` python
# Print every counted word together with its number of occurrences, in the
# order stored in most_frequent (most frequent first).
for word, occurrences in most_frequent.items():
    print(word, occurrences)
# Report how many CSV lines were read in total.
print(f'Processed {line_count} lines.')
```
%% Cell type:markdown id:5d386410-f8b7-4c75-bf33-036fc8e67336 tags:
The code below creates a bar plot of the 15 most frequent words.
%% Cell type:code id:93887178-fe93-43ea-91f1-b661cf6c1a43 tags:
``` python
# Draw a bar plot of the 15 most frequent words, save it as a PNG and show it.
import itertools

# most_frequent is ordered by descending count, so its first 15 items are the
# 15 most frequent words.
first_15 = dict(itertools.islice(most_frequent.items(), 15))
names = list(first_15.keys())
values = list(first_15.values())

font1 = {'family': 'serif', 'color': 'black', 'size': 10}
font2 = {'family': 'serif', 'color': 'black', 'size': 8}

# The title is set exactly once; a second plt.title call would simply replace
# the first one.
plt.title("Cyber Insurance: Word Frequency Abstract", fontdict=font1)
plt.ylabel("Number of Occurrences", fontdict=font2)
plt.xticks(rotation=45, fontsize=8)  # Rotates X-Axis Ticks by 45-degrees
plt.bar(names, values)
# Only the final figure file is written (no leftover test image).
plt.savefig('frequency_cyber_insurance_abstract.png', dpi=400, bbox_inches='tight')  # change dpi for image resolution
plt.show()
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment