Binary file added A_Midsummer_Nights_Dream.pickle
Binary file not shown.
Binary file added Alls_Well_That_Ends_Well.pickle
Binary file not shown.
Binary file added Antony_and_Cleopatra.pickle
Binary file not shown.
Binary file added Comedy_Of_Errors.pickle
Binary file not shown.
Binary file added Coriolanus.pickle
Binary file not shown.
Binary file added Cymbeline.pickle
Binary file not shown.
Binary file added Hamlet.pickle
Binary file not shown.
Binary file added Julius_Caesar.pickle
Binary file not shown.
Binary file added King_Henry_IV.pickle
Binary file not shown.
Binary file added King_John.pickle
Binary file not shown.
Binary file added King_Richard_II.pickle
Binary file not shown.
Binary file added Loves_Labours_Lost.pickle
Binary file not shown.
Binary file added Measure_for_Measure.pickle
Binary file not shown.
13 changes: 11 additions & 2 deletions README.md
@@ -1,3 +1,12 @@
# TextMining
To run this program you will need to download the GitHub repository as well as the Natural Language Toolkit (NKTL), the requests package and the Vader sentiment analysis package
There are just generally a few typos in this file, and this sentence is repeated at the bottom. I would suggest putting the lines that should go in the command line between backticks because that's good Markdown formatting (i.e., pip install vaderSentiment would become `pip install vaderSentiment`).


This is the base repo for the text mining and analysis project for Software Design at Olin College.
The requests package is used to request data from the internet. It can be downloaded by typing `pip install requests` into the Windows command line.

The VADER sentiment analysis package contains tools for performing sentiment analysis. It can be downloaded by typing `pip install vaderSentiment` into the Windows command line.

The Natural Language Toolkit has a wide variety of tools related to language. To download the NLTK, simply type `pip install nltk` into your Windows command line.

After all of the appropriate packages are downloaded, go to the repository and type `python minebooks2.py` into the command line.
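
For reference, the full setup and run sequence described above (commands copied from the instructions) is:

```
pip install requests
pip install vaderSentiment
pip install nltk
python minebooks2.py
```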

The report for this project can be found at https://github.com/hthomas60/TextMining/blob/master/Report2.pdf
Binary file added Report.pdf
Binary file not shown.
Binary file added Report2.pdf
Binary file not shown.
29 changes: 29 additions & 0 deletions loadBooks.py
@@ -0,0 +1,29 @@
import pickle
import requests
import codecs

def loadbooks():
    """
    Loads a book from gutenberg.org. The book id has to be manually changed for each book.
    """
    downloaded_book = requests.get('http://www.gutenberg.org/ebooks/1522.txt.utf-8').text
    return downloaded_book

def savebook(book_text, filename):
    """
    Saves the text of a book into a pickle file.
    """
    with open(filename, 'wb') as f:
        pickle.dump(book_text, f)

def opensavedbook(file):
    """
    Opens a pickled book file that is saved on the computer.
    """
    with open(file, 'rb') as input_file:
        opened_text = pickle.load(input_file)
    return opened_text
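
A quick usage sketch of these three helpers together (the filename here is illustrative, not one of the real play files):

```python
# Hypothetical round trip: download a book, pickle it, and load it back.
text = loadbooks()                       # fetches the hard-coded Gutenberg book id 1522
savebook(text, 'example_play.pickle')    # example filename
reloaded = opensavedbook('example_play.pickle')
assert reloaded == text
```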



205 changes: 205 additions & 0 deletions minebooks2.py
@@ -0,0 +1,205 @@
from loadBooks import *
import string
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from collections import defaultdict # used for counting word frequencies
import random
import operator


def removegutnburg(text):
    """
    Removes the Project Gutenberg license information so that the text can
    be analyzed.
    """

    licence = "ject Gutenberg Association / Carnegie Mellon University" # last line of the license
    bookstart = "by William Shakespeare" # every play opens with "<title> by William Shakespeare"
    bookend = "THE END"
    start_pos = text.find(licence) + 60 # finds the end of the licensing agreement
    start_reading = text.find(bookstart, start_pos) + 22 # skips past the 22-character marker itself
    end_reading = text.find(bookend, start_reading) # finds THE END at the end of the book
    return text[start_reading:end_reading] # returns the text of the play
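
As a toy illustration of the find-and-slice logic (made-up snippet, not real Gutenberg text; the 22 being added is len("by William Shakespeare")):

```python
text = "...license... Hamlet by William Shakespeare ACT I ... THE END notes"
start = text.find("by William Shakespeare") + 22  # skip past the 22-character marker
end = text.find("THE END", start)
print(text[start:end])  # ' ACT I ... '
```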


def loadjustbooks(playfiles):
    """
    Takes a list of saved play filenames and returns a list of play texts,
    with the Gutenberg license, newlines, and punctuation removed.
    Ex. makes a list of the full text of all of the comedic plays.
    """

    plays = []

    for i in playfiles:
        play = opensavedbook(i) # opens the saved play file
        just_play = removegutnburg(play) # removes the Gutenberg license
        just_play = RemoveAllButLettersAndSpaces(just_play) # removes newlines and punctuation
        plays.append(just_play) # adds the current play to the play list

    return plays # returns the list of plays

def RemoveAllButLettersAndSpaces(mystring):
    """
    Removes special characters and punctuation from play texts.

    >>> RemoveAllButLettersAndSpaces("Qwe.Tr,!")
    'qwetr'
    """
    mystring = mystring.replace('\r', ' ')
    mystring = mystring.replace('\n', ' ')
    mystring = mystring.replace('\'', ' ')
    mystring = mystring.replace('[', '')
    mystring = mystring.replace(']', '')
    mystring = mystring.replace('.', '')
    mystring = mystring.replace('?', '')
    mystring = mystring.replace('!', '')
    mystring = mystring.replace(';', '')
    mystring = mystring.replace(':', '')
    mystring = mystring.replace('-', '')
    mystring = mystring.replace('\"', '')
    mystring = mystring.replace(',', '')
    return mystring.lower() # lowercase so that words like "The" and "the" count as the same word
There's a cleaner way to do this with a for loop. Make a list of all the characters you want to eliminate and iterate through that list rather than writing down every single character as a distinct line.
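
Going one step further than the suggested for loop, a single str.maketrans table can do all of the substitutions in one pass; a sketch (the function name is mine, and the behavior should match the replace chain above):

```python
def remove_all_but_letters_and_spaces(mystring):
    # '\r', '\n', and '\'' become spaces; the punctuation in the third
    # argument is deleted outright, matching the original replace chain.
    table = str.maketrans("\r\n'", "   ", '[].?!;:-",')
    return mystring.translate(table).lower()
```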


def RunSentAnalysis(mylist):
    """
    Runs sentiment analysis and returns the positive and negative sentiment scores in a list.
    """
    res = []
    analyzer = SentimentIntensityAnalyzer()
    answer = analyzer.polarity_scores(mylist)
    res.append(answer["pos"])
    res.append(answer["neg"])
    return res


def Most_Common(text):
    """
    Takes a string and returns a list of the 25 most common words in it.
    """
    top_words = []
    freqwords = defaultdict(int) # initializes a dictionary of word counts

    for words in text.split(): # goes through all of the words in the string
        freqwords[words] += 1 # each time a word is found, add 1 to its counter
    sortedwords = sorted(freqwords.items(), key=operator.itemgetter(1), reverse=True) # sorts the words from most to least frequent
    for i in range(25):
        top_words.append(sortedwords[i][0]) # stores the 25 most common words
    return top_words # returns a list of the 25 most common words
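
For comparison, collections.Counter (already in the standard library) can do the counting and sorting in one step; a sketch, with the function name mine:

```python
from collections import Counter

def most_common_counter(text, n=25):
    # Counter.most_common(n) returns the n highest-count (word, count) pairs.
    return [word for word, count in Counter(text.split()).most_common(n)]
```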

def commonOverAll(list1, list2, list3):
    """
    Returns a list of all the elements common to three lists. In this program it is
    used to find the words that are common across all three play types.
    """

    return list(set(list(set(list1).intersection(list2))).intersection(list3))
@Elepert Elepert Oct 16, 2017
Minor detail, but there's probably a cleaner way to do this.
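
One cleaner form, as a sketch: Python's set intersection operator chains directly, so the nested list/set conversions are unnecessary (the function name is mine):

```python
def common_over_all(list1, list2, list3):
    # Intersect the three word lists in one expression.
    return list(set(list1) & set(list2) & set(list3))
```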


def removewords(words, wordstoremove):
    """
    Removes the words that are universally common among all play types.
    """
    words = words.split()
    return [x for x in words if x not in wordstoremove] # keeps the elements of words that are not in wordstoremove
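
A small performance note: converting wordstoremove to a set first makes each membership test O(1) instead of a scan of the whole list; a sketch (the name is mine):

```python
def removewords_fast(words, wordstoremove):
    remove = set(wordstoremove)  # set lookup is O(1) per word
    return [x for x in words.split() if x not in remove]
```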

def listtostring(mylist):
    """
    Converts a list of words to a single string of words.
    """
    return ' '.join(mylist)

def linklists(mylist):
    """
    Joins the text of several plays into one string.
    """
    res = []
    for i in range(len(mylist)):
        res += [mylist[i]]
    return listtostring(res)

def sampling(mylist, trials):
    """
    Takes a random sample of ten words from mylist and performs
    sentiment analysis on it. Averages the positive and the
    negative scores across many trials. Returns a list with
    the average positive and negative intensities.
    """
    pos = 0
    neg = 0
    for i in range(trials):
        sample = listtostring(random.sample(mylist, 10))
        sentiment = RunSentAnalysis(sample)
        pos += sentiment[0]
        neg += sentiment[1]

    return [pos/trials, neg/trials]
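
A quick usage sketch with toy data (random.sample needs at least ten words to draw from; the words and trial count here are arbitrary):

```python
toy_words = ["sweet", "love", "death", "fear", "joy",
             "bright", "grief", "merry", "woe", "hope"]
print(sampling(toy_words, 50))  # prints [average_pos, average_neg]
```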


def textmining(): # Main function that runs the text mining code.
    """
    13 of Shakespeare's plays were saved from gutenberg.org.
    The file names of the plays were sorted into three lists:
    comedies, tragedies, and histories.
    """

    comedies = ['A_Midsummer_Nights_Dream.pickle', 'Alls_Well_That_Ends_Well.pickle']
    tragedies = ['Antony_and_Cleopatra.pickle']
    tragedies.append('Coriolanus.pickle')
    tragedies.append('Cymbeline.pickle')
    histories = ['King_Henry_IV.pickle']
    histories.append('King_John.pickle')
    histories.append('King_Richard_II.pickle')
@Elepert Elepert Oct 16, 2017
I'm a little confused about why you appended these strings instead of just writing them out initially.
Do this:
tragedies = ['Antony_and_Cleopatra.pickle', 'Coriolanus.pickle', 'Cymbeline.pickle']
instead of:
tragedies = ['Antony_and_Cleopatra.pickle']
tragedies.append('Coriolanus.pickle')
tragedies.append('Cymbeline.pickle')



    collection = [] # list to store all three types of plays
    collection.append(loadjustbooks(comedies)) # loads the text of the comedies into the first element
    collection.append(loadjustbooks(tragedies)) # loads the text of the tragedies into the second element
    collection.append(loadjustbooks(histories)) # loads the text of the histories into the third element
    # the collection was broken up into comedies, tragedies, and
    # histories to increase code readability
    all_comedies = linklists(collection[0]) # combines all of the saved comedies into one string
    all_tragedies = linklists(collection[1]) # combines all of the saved tragedies into one string
    all_histories = linklists(collection[2]) # combines all of the saved histories into one string

    common_comedies = Most_Common(all_comedies) # finds the most common words in Shakespeare's comedies
    common_tragedies = Most_Common(all_tragedies) # finds the most common words in Shakespeare's tragedies
    common_histories = Most_Common(all_histories) # finds the most common words in Shakespeare's histories

    common_words = commonOverAll(common_comedies, common_tragedies, common_histories) # makes a list of the words common to all three play types

    comedy_uncommon = removewords(all_comedies, common_words) # removes the universally common words from the comedic plays
    tragedy_uncommon = removewords(all_tragedies, common_words) # removes the universally common words from the tragic plays
    history_uncommon = removewords(all_histories, common_words) # removes the universally common words from the historic plays





print("\n")
print("Sentiment Analysis Average of Comedic Plays")
average = sampling(comedy_uncommon,500) #preform Sentiment Analyses on all three play types
print (average)
print("Sentiment Analysis of Tragic Plays")
average = sampling(tragety_uncommon,500) #preform Sentiment Analyses on all three play types
print (average)
print("Sentiment Analysis of Historic Plays")
average = sampling(tragety_uncommon,500) #preform Sentiment Analyses on all three play types
print (average)
#RunSentAnalysis(tragic_string)
#print("\n")
#print("Sentiment Analysis of Historic Plays")
#RunSentAnalysis(historic_string)




if __name__ == "__main__":
    import doctest
    doctest.testmod()
    textmining()