Replace frequency.ipynb

6897019d · User expired · b50cfbc6 · 6897019d
Commit 6897019d authored 2 years ago by User expired
--- a/code_analysis/frequency.ipynb
+++ b/code_analysis/frequency.ipynb
@@ -123,12 +123,12 @@
    "font1 = {'family':'serif','color':'black','size':10}\n",
    "font2 = {'family':'serif','color':'black','size':8}\n",
    "\n",
-    "plt.title(\"Word Frequency Abstract\", fontdict = font1)\n",
+    "plt.title(\"Cyber Insurance: Word Frequency Abstract\", fontdict = font1)\n",
    "plt.ylabel(\"Number of Occurrences\", fontdict = font2)\n",
    "plt.xticks(rotation = 45, fontsize=8) # Rotates X-Axis Ticks by 45-degrees\n",
    "plt.bar(names, values)\n",
    "\n",
-    "plt.savefig('test400.png', dpi=400)#change dpi for image quality\n",
+    "plt.savefig('frequency_cyber_insurance_abstract.png', dpi=400, bbox_inches='tight')#change dpi for image resolution\n",
    "plt.show()"
   ]
  }

 %% Cell type:markdown id:bf76c08e-eff5-4215-b655-5de623c399b3 tags:

 This program **counts the word frequency** <br>

 inspired by: https://code.tutsplus.com/tutorials/counting-word-frequency-in-a-file-using-python--cms-25965

 %% Cell type:code id:50559021-c803-47a6-b3f3-bc2b5d7065da tags:

 ``` python
 # Install a pip package in the current Jupyter kernel
 import sys
 !{sys.executable} -m pip install matplotlib
 ```

 %% Cell type:code id:e9172b0e-d5be-483d-a670-08b5586b379e tags:

 ``` python
 import csv
 import re
 from matplotlib import pyplot as plt
 import nltk
 nltk.download("stopwords")
 ```

 %% Cell type:markdown id:238ce4bc-a9db-438d-b9e9-4a08eeb12dcf tags:

 row[1] = speaker name, row[2] = company name, row[3] = talk title, row[4] = abstract, ... <br>
 We remove stopwords with **nltk**, but you can also manually add words to the blacklist.

 %% Cell type:code id:f33da894-1c23-4bcd-81b3-4ec61bc22413 tags:

 ``` python
 from collections import Counter

 # Compiled once, outside the row loop.
 # We only consider words made of the letters a-z with a length between 3 and 20.
 word_pattern = re.compile(r'\b[a-z]{3,20}\b')

 # newline='' is what the csv module asks for so that quoted fields containing
 # line breaks are parsed correctly.
 with open(r"C:\Users\lukas\Downloads\speaker_cyber_insurance.csv", encoding='utf8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=';')
    line_count = 0
    frequency = Counter()
    stopwords = nltk.corpus.stopwords.words('english')
    #you can also manually remove some specific words:
    blacklisted = ['also', 'take']
    # One set for both exclusion lists, so each membership test is a hash lookup.
    excluded = set(stopwords) | set(blacklisted)
    #read each row and count the frequency of each word
    for row in csv_reader:
        line_count += 1
        if line_count == 1:
            #first row are column names
            continue
        if len(row) < 5:
            # Blank or truncated row without an abstract column: nothing to count.
            continue
        # Lowercase first: the pattern only matches a-z, so without this every
        # capitalised word (e.g. at the start of a sentence) would be dropped.
        words = word_pattern.findall(row[4].lower())
        frequency.update(word for word in words if word not in excluded)

    # Words ordered by descending count; ties keep first-seen order.
    most_frequent = dict(frequency.most_common())
    most_frequent_count = most_frequent.keys()
 ```

 %% Cell type:markdown id:a7f4800d-e140-419b-9304-618f21390438 tags:

 Print words by frequency distribution

 %% Cell type:code id:11a15c93-929b-4483-b664-a3c644bcc607 tags:

 ``` python
 # Print every counted word next to its number of occurrences,
 # in the order stored in most_frequent.
 for word, occurrences in most_frequent.items():
    print(word, occurrences)

 # Report how many CSV lines were read in total.
 print(f'Processed {line_count} lines.')
 ```

 %% Cell type:markdown id:5d386410-f8b7-4c75-bf33-036fc8e67336 tags:

 The code below creates a bar plot of the 15 most frequent words.

 %% Cell type:code id:93887178-fe93-43ea-91f1-b661cf6c1a43 tags:

 ``` python
 import itertools

 # Take the first 15 entries of most_frequent and split them into
 # bar labels (names) and bar heights (values).
 first_15 = dict(itertools.islice(most_frequent.items(),15))
 names = list(first_15.keys())
 values = list(first_15.values())

 # Font settings: font1 is used for the title, font2 for the y-axis label.
 font1 = {'family':'serif','color':'black','size':10}
 font2 = {'family':'serif','color':'black','size':8}

-plt.title("Word Frequency Abstract", fontdict = font1)
+plt.title("Cyber Insurance: Word Frequency Abstract", fontdict = font1)
 plt.ylabel("Number of Occurrences", fontdict = font2)
 plt.xticks(rotation = 45, fontsize=8) # Rotates X-Axis Ticks by 45-degrees
 plt.bar(names, values)

 # Write the figure to a PNG file first, then display it.
-plt.savefig('test400.png', dpi=400)#change dpi for image quality
+plt.savefig('frequency_cyber_insurance_abstract.png', dpi=400, bbox_inches='tight')#change dpi for image resolution
 plt.show()
 ```