-
Notifications
You must be signed in to change notification settings - Fork 16
Mini Project 3 #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 13 commits
bf57115
6f4d946
0d5b6e4
9336118
8da78a2
d766343
e082fe6
ed1d203
37fc4aa
468bbe2
4213dd7
e82a465
2be8e06
780d271
f8866c7
338edf0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,12 @@ | ||
| # TextMining | ||
| To run this program you will need to download the GitHub repository as well as the Natural Language Toolkit (NLTK), the requests package and the Vader sentiment analysis package | ||
|
|
||
| This is the base repo for the text mining and analysis project for Software Design at Olin College. | ||
| The requests package is used to request data from the internet. It can be downloaded by typing `pip install requests` into the Windows command line. | ||
|
|
||
| The Vader sentiment analysis package contains tools for performing sentiment analysis. It can be downloaded by typing `pip install vaderSentiment` into the Windows command line. | ||
|
|
||
| The natural language toolkit has a wide variety of tools related to language. To download the NLTK simply type `pip install nltk` into your Windows command line. | ||
|
|
||
| After all of the appropriate packages are downloaded, go to the repository and type `python minebooks2.py` into the command line. | ||
| To run this program you will need to download the GitHub repository as well as the Natural Language Toolkit (NKTL), the requests package and the Vader sentiment analysis package | ||
|
|
||
| The report for this project can be found at https://github.com/hthomas60/TextMining/blob/master/Report2.pdf | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| import pickle | ||
| import requests | ||
| import codecs | ||
|
|
||
def loadbooks(book_id=1522):
    """
    Download the plain text of a book from gutenberg.org.

    book_id: integer Project Gutenberg ebook id. Defaults to 1522 (the
             id that was previously hard-coded), so existing callers are
             unaffected, but new callers no longer have to edit this
             function for each book.
    Returns the full text of the book as a string.
    """
    url = 'http://www.gutenberg.org/ebooks/%d.txt.utf-8' % book_id
    downloaded_book = requests.get(url).text
    return downloaded_book
|
|
||
def savebook(book_text, filename):
    """
    Pickle the text of a book into a file.

    book_text: string contents of the book.
    filename:  path of the file to write.
    """
    # 'with' guarantees the file handle is closed even if pickling fails;
    # the original open()/close() pair leaked the handle on error.
    with open(filename, 'wb') as f:
        pickle.dump(book_text, f)
|
|
||
def opensavedbook(file):
    """
    Load previously pickled book text from a file on the computer.

    file: path to a pickle file produced by savebook().
    Returns the unpickled book text.
    """
    # The original never closed the file; 'with' closes it deterministically.
    with open(file, 'rb') as input_file:
        opened_text = pickle.load(input_file)
    return opened_text
|
|
||
|
|
||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,205 @@ | ||
import operator
import random
import string
from collections import Counter   # word-frequency counting
from collections import defaultdict  # frequwords

import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from loadBooks import *
|
|
||
|
|
||
| def removegutnburg(text): | ||
| """ | ||
| Removes the Gutenberg license information for so that the text can | ||
| be analyzed. | ||
| """ | ||
|
|
||
| licence = "ject Gutenberg Association / Carnegie Mellon University" #last line in license | ||
| bookstart = "by William Shakespeare" # set to Act V to | ||
| bookend = "THE END" | ||
| start_pos = text.find(licence) + 60 #finds the end of the licensing agreement | ||
| start_reading = text.find(bookstart,start_pos) + 22 #all of the books start with "title" by William Shakespeare | ||
| end_reading = text.find(bookend,start_reading ) #finds THE END at the end of the book | ||
| return (text[start_reading:end_reading]) #returns text of the play | ||
|
|
||
|
|
||
def loadjustbooks(playfiles):
    """
    Load each pickled play and clean it for analysis.

    playfiles: list of pickle file names of saved plays.
    Returns a list with one cleaned play text per input file: the
    Gutenberg license is removed, then newlines and punctuation are
    stripped. E.g. used to build a list of all the comedic play texts.
    """
    # open -> strip license -> strip newlines/punctuation, per file
    return [
        RemoveAllButLettersAndSpaces(removegutnburg(opensavedbook(name)))
        for name in playfiles
    ]
|
|
||
def RemoveAllButLettersAndSpaces(mystring):
    """
    Normalize play text for word analysis.

    Carriage returns, newlines and apostrophes become spaces;
    the characters [ ] . ? ! ; : - " , are deleted outright; the
    result is lower-cased so "The" and "the" count as the same word.

    >>> RemoveAllButLettersAndSpaces("\\r\\nQWEtr.,")
    '  qwetr'
    """
    # One translation table handles every substitution in a single C-level
    # pass, replacing the original chain of thirteen .replace() calls.
    # (The original doctest expected 'qwe2tr', which the code never produced.)
    table = str.maketrans("\r\n'", "   ", '[].?!;:-",')
    return mystring.translate(table).lower()
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a cleaner way to do this with a for loop. Make a list of all the characters you want to eliminate and iterate through that list rather than writing down every single character as a distinct line. |
||
|
|
||
def RunSentAnalysis(mylist):
    """
    Run VADER sentiment analysis on a string of text.

    Returns a two-element list [positive_score, negative_score].
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(mylist)
    return [scores["pos"], scores["neg"]]
|
|
||
|
|
||
def Most_Common(text):
    """
    Return the (up to) 25 most common words in a string.

    text: whitespace-separated string of words.
    Returns a list of words ordered from most to least frequent.
    Unlike the original hand-rolled sort, this no longer raises
    IndexError when the text has fewer than 25 distinct words.
    """
    # Counter.most_common does the count-and-sort in one idiomatic call.
    counts = Counter(text.split())
    return [word for word, _ in counts.most_common(25)]
|
|
||
def commonOverAll(list1, list2, list3):
    """
    Return the elements common to all three lists (order unspecified).

    In this program it finds the words that are universally common
    across the three play types.
    """
    # A single three-way set intersection replaces the original nested
    # list(set(list(set(...))...)) round-trips; same elements, same
    # (set-determined) ordering behavior.
    return list(set(list1) & set(list2) & set(list3))
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor detail, but there's probably a cleaner way to do this. |
||
|
|
||
def removewords(words, wordstoremove):
    """
    Drop the universally common words from a play's text.

    words:         whitespace-separated string of play words.
    wordstoremove: iterable of words to filter out.
    Returns a list of the remaining words, in original order.
    """
    # A set makes each membership test O(1); the original tested against
    # a list, costing O(len(wordstoremove)) per word.
    banned = set(wordstoremove)
    return [w for w in words.split() if w not in banned]
|
|
||
def listtostring(mylist):
    """
    Join a list of words into a single space-separated string.
    """
    joined = " ".join(mylist)
    return joined
|
|
||
def linklists(mylist):
    """
    Combine several word strings into one space-separated string.

    The original loop only copied mylist element by element before
    joining, so a direct join produces the identical result without
    the redundant O(n) copy.
    """
    return ' '.join(mylist)
|
|
||
def sampling(mylist, trials):
    """
    Estimate the sentiment of a word list by repeated random sampling.

    Each trial draws ten random words from mylist, scores them with
    VADER, and the positive/negative intensities are averaged over
    all trials. Returns [average_positive, average_negative].
    """
    pos_total = 0
    neg_total = 0
    for _ in range(trials):
        chunk = listtostring(random.sample(mylist, 10))
        positive, negative = RunSentAnalysis(chunk)
        pos_total += positive
        neg_total += negative
    return [pos_total / trials, neg_total / trials]
|
|
||
|
|
||
def textmining():
    """
    Main driver for the text mining analysis.

    13 of Shakespeare's plays were saved from gutenberg.org. Their
    file names are sorted into three lists -- comedies, tragedies,
    and histories. The words common to all three play types are
    removed (they carry no genre signal), then the average sentiment
    of random samples from each group is printed.
    """
    comedies = ['A_Midsummer_Nights_Dream.pickle',
                'Alls_Well_That_Ends_Well.pickle']
    tragedies = ['Antony_and_Cleopatra.pickle',
                 'Coriolanus.pickle',
                 'Cymbeline.pickle']
    histories = ['King_Henry_IV.pickle',
                 'King_John.pickle',
                 'King_Richard_II.pickle']

    # collection is broken up into comedies, tragedies and histories
    # to increase code readability
    colletion = []
    colletion.append(loadjustbooks(comedies))
    colletion.append(loadjustbooks(tragedies))
    colletion.append(loadjustbooks(histories))

    all_comedies = linklists(colletion[0])   # all comedies as one string
    all_trageties = linklists(colletion[1])  # all tragedies as one string
    all_histories = linklists(colletion[2])  # all histories as one string

    # the 25 most common words within each play type
    common_comedies = Most_Common(all_comedies)
    common_trageties = Most_Common(all_trageties)
    common_histories = Most_Common(all_histories)

    # words common across all three play types
    common_words = commonOverAll(common_comedies, common_trageties,
                                 common_histories)

    comedy_uncommon = removewords(all_comedies, common_words)
    tragety_uncommon = removewords(all_trageties, common_words)
    history_uncommon = removewords(all_histories, common_words)

    print("\n")
    print("Sentiment Analysis Average of Comedic Plays")
    average = sampling(comedy_uncommon, 500)
    print(average)
    print("Sentiment Analysis of Tragic Plays")
    average = sampling(tragety_uncommon, 500)
    print(average)
    print("Sentiment Analysis of Historic Plays")
    # BUG FIX: the original re-sampled tragety_uncommon here, so the
    # "Historic" figures printed were actually tragedy figures.
    average = sampling(history_uncommon, 500)
    print(average)
|
|
||
|
|
||
|
|
||
|
|
||
if __name__ == "__main__":
    # Run the analysis only when executed as a script, so importing this
    # module (e.g. for testing) does not trigger network/file work.
    textmining()
    # To run the doctests instead:
    # import doctest; doctest.testmod()
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are just generally a few typos in this file, and this sentence is repeated at the bottom. I would suggest putting the lines that should go in the command line between `` because that's good markdown formatting. (ie: pip install vaderSentiment <= would look like =>
pip install vaderSentiment)