neuralinfo · daveatrandom · Mar 13, 2015
diff --git a/Assignment3_1_1.py b/Assignment3_1_1.py
@@ -0,0 +1,117 @@
+#! /usr/bin/env python
+#
+# David Paculdo
+# W205
+# Assignment 3
+#
+# Streams tweets matching the search terms given on the command line and
+# stores each tweet in the "tweets" collection of a mongodb database.
+# Written for Python 2 (uses print statements).
+
+import sys
+import pymongo
+import tweepy
+import signal
+import json
+import os
+import string
+import time
+
+from tweepy import Stream
+from tweepy import OAuthHandler
+from tweepy.streaming import StreamListener
+
+# Twitter credentials are read from environment variables
+ckey = os.environ.get("twitter_consumer_key")
+csecret = os.environ.get("twitter_consumer_secret")
+
+atoken = os.environ.get("twitter_access_token")
+asecret = os.environ.get("twitter_access_token_secret")
+
+# os.environ.get returns None for an unset variable; stop early instead of
+# attempting to authenticate with missing credentials.
+if None in (ckey, csecret, atoken, asecret):
+    print >> sys.stderr, "Twitter credentials are not set in the environment"
+    sys.exit(1)
+
+
+# Defaults for Twitter Stream
+# term list must be in quotes on command line e.g. "#microsoft,#mojang"
+try:
+    termList = str(sys.argv[1])
+except IndexError:
+    # No search terms were given. Exit here: continuing would only fail
+    # later with a NameError because termList was never assigned.
+    print "Usage: Assignment3_1_1.py \"[search terms separated by comma]\""
+    sys.exit(1)
+print "Search terms: "+termList
+
+
+# DB name for mongodb
+db_name = "db_streamT"
+
+
+# Instance of the Twitter stream listener
+class listener(StreamListener):
+    """Stream listener that prints each tweet and stores it in mongodb."""
+
+    def __init__(self, api):
+        # Name this class in super() so that StreamListener.__init__ runs;
+        # super(StreamListener, self) would skip it.
+        super(listener, self).__init__(api)
+        self.api = api
+
+        #change name of client and database to what we want it to be
+        self.client = pymongo.MongoClient()
+        self.db = self.client[db_name]
+
+    def on_data(self, data):
+        """Print the text of one stream message and insert it into mongodb.
+
+        data is the raw JSON string of the message. Returns True,
+        which tells tweepy to keep the stream open.
+        """
+        data = json.loads(data)
+
+        # Streaming messages such as limit or delete notices have no 'text'
+        # field; skip them instead of stopping the stream with a KeyError.
+        if not isinstance(data, dict) or 'text' not in data:
+            return True
+
+        # Only characters in string.printable are echoed to the terminal
+        text = filter(lambda x: x in string.printable, data['text'])
+        print text
+
+        # Writing to mongodb. Can change tweets collection to something else
+        self.db.tweets.insert(data)
+
+        return True
+
+    def on_error(self, status):
+        """Report the error status; on status 420 sleep 600 seconds first.
+
+        Returns True, which tells tweepy to keep the stream open.
+        """
+        print >> sys.stderr, 'Encountered error with status code:', status
+        if status == 420:
+            time.sleep(600)
+        return True
+
+    def on_timeout(self):
+        """Report the timeout and return True to keep the stream open."""
+        print >> sys.stderr, 'Timeout...'
+        return True
+
+
+#Begin Twitter stream access
+auth = OAuthHandler(ckey, csecret)
+auth.set_access_token(atoken, asecret)
+api = tweepy.API(auth, retry_count=5, retry_delay=5, retry_errors=set([401,404,500,503]), timeout=120)
+
+
+try:
+    twitterStream = Stream(auth, listener(api))
+    twitterStream.filter(track=[termList])
+except KeyboardInterrupt:
+    print("Interrupt called")
+    sys.exit()
diff --git a/Assignment3_1_2.py b/Assignment3_1_2.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# David Paculdo
+# W205
+# Assignment 3
+#
+# Downloads one text file of tweets from S3 and inserts every line into
+# mongodb as a {"tweet": <line>} document.
+
+from boto.s3.connection import S3Connection
+from boto.s3.key import Key
+import os
+import pymongo
+
+
+#Amazon AWS variables
+AWS_KEY=os.environ.get("AWS_ACCESS_KEY")
+AWS_SECRET=os.environ.get("AWS_SECRET_KEY")
+
+
+#mongodb variables
+db_name="db_tweets"
+coll="tweets"
+
+
+#Connection to AWS
+s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
+#bucket must already be created
+bucket = s3_conn.get_bucket("w205-assignment-2-dpaculdo")
+tmpfile="temp_from_s3"
+
+k=Key(bucket)
+k.key="microsoft_OR_mojang_2015-02-07_2015-02-14_tweets_0.txt"
+
+
+#mongodb connection (kept under its own name, separate from the S3 connection)
+conn=pymongo.MongoClient()
+db=conn[db_name]
+collection=db[coll]
+
+
+try:
+    k.get_contents_to_filename(tmpfile)
+
+    #Insert into mongodb. Replace "\n" with space.
+    #The with block closes the file before it is removed below.
+    with open(tmpfile,"r") as my_file:
+        for line in my_file:
+            collection.insert({"tweet":line.replace("\n"," ")})
+finally:
+    #Clean up, even when the download or an insert fails
+    if os.path.exists(tmpfile):
+        os.remove(tmpfile)
diff --git a/Assignment3_1_2b.py b/Assignment3_1_2b.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# David Paculdo
+# W205
+# Assignment 3
+#
+# Downloads the numbered raw tweet files from S3 one at a time and inserts
+# every record into mongodb.
+
+from boto.s3.connection import S3Connection
+from boto.s3.key import Key
+import os
+import pymongo
+import string
+import ast
+
+
+#Amazon AWS variables and connection
+AWS_KEY=os.environ.get("AWS_ACCESS_KEY")
+AWS_SECRET=os.environ.get("AWS_SECRET_KEY")
+
+s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
+bucket = s3_conn.get_bucket("w205-assignment-2-dpaculdo")
+
+k=Key(bucket)
+
+
+def raw_file_name(count):
+    """Return the hardcoded name of raw file number count."""
+    return "microsoft_OR_mojang_2015-02-07_2015-02-14_"+str(count)+".raw"
+
+
+filecount=0
+rawfile=raw_file_name(filecount)
+k.key=rawfile
+
+
+#mongodb variables and connection
+db_name="db_restT"
+coll="tweets"
+conn=pymongo.MongoClient()
+db=conn[db_name]
+collection=db[coll]
+
+
+#Iterates through all raw Twitter data files from S3 and inserts into mongodb
+#Stops at the first file number that does not exist in the bucket
+while k.exists():
+    try:
+        k.get_contents_to_filename(rawfile)
+
+        with open(rawfile,"r") as my_file:
+            for line in my_file:
+                #A blank line is not a record; literal_eval would raise on it
+                if not line.strip():
+                    continue
+                #Each line is parsed as a Python literal (expected: a dict)
+                linedict=ast.literal_eval(line)
+                collection.insert(linedict)
+    finally:
+        #Remove the local copy even when parsing or inserting fails
+        if os.path.exists(rawfile):
+            os.remove(rawfile)
+
+    filecount+=1
+    rawfile=raw_file_name(filecount)
+    k.key=rawfile
diff --git a/Assignment3_2_1.py b/Assignment3_2_1.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# David Paculdo
+# W205
+# Assignment 3
+#
+# Finds the 30 most common retweets in db_tweets, writes them to
+# most_retweeted.txt and stores the users who posted them in db_followers.
+
+import os
+import pymongo
+import string
+import sys
+from collections import Counter
+
+
+#Global program variables
+db_name="db_tweets"
+db_name2="db_restT"
+db_name3="db_followers"
+
+coll="tweets"
+users="userlist"
+conn=pymongo.MongoClient()
+
+
+#Make sure database and collection are accessible:
+try:
+    db=conn[db_name]
+    collection=db[coll]
+
+    db2=conn[db_name2]
+    collection2=db2[coll]
+
+    db3=conn[db_name3]
+    collection3=db3[users]
+except Exception:
+    print "dbname or collection does not exist"
+    #Non-zero exit status signals the failure to the caller
+    sys.exit(1)
+
+
+def clean_text(raw):
+    """Return raw without non-printable characters and without newlines."""
+    text=filter(lambda x: x in string.printable, raw)
+    return text.replace("\n","")
+
+
+#Search for retweets
+tweet_list=[]
+for tweets in collection.find():
+    text=clean_text(tweets['tweet'])
+
+    if text.startswith("RT "):
+        tweet_list.append(text)
+
+#Count and sort retweet list to find the 30 most common retweets.
+tweet_count=Counter(tweet_list)
+tweet_most=tweet_count.most_common(30)
+
+
+#Block to find information on the users who retweeted the 30 most common retweets
+#A single pass over the collection groups the matching users by tweet text,
+#instead of re-reading the whole collection once per retweet.
+matches={tweet: [] for tweet,count in tweet_most}
+for tweets in collection2.find():
+    text=clean_text(tweets['text'])
+
+    if text in matches:
+        userinfo=tweets['user']
+        location=userinfo['location']
+        #A profile may have no location (None); store it as-is in that case
+        if location is not None:
+            location=location.encode("utf-8")
+        matches[text].append({
+            "username":userinfo['screen_name'].encode("utf-8"),
+            "user_id":userinfo['id'],
+            "follower_count":userinfo['followers_count'],
+            "location":location,
+        })
+
+
+#Write each retweet that was found once, most common first,
+#then insert its users into mongodb
+with open("most_retweeted.txt","w") as my_file:
+    for tweet,count in tweet_most:
+        if not matches[tweet]:
+            continue
+
+        print tweet
+        my_file.write(tweet+"\n")
+        for userdoc in matches[tweet]:
+            collection3.insert(userdoc)
diff --git a/Assignment3_2_2.py b/Assignment3_2_2.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# David Paculdo
+# W205
+# Assignment 3
+#
+# Computes word counts and the lexical diversity of the streamed tweets,
+# writes the summary to lexical_diversity.txt and stores the results in mongodb.
+
+import os
+import pymongo
+import string
+import sys
+from collections import Counter
+
+from tweepy import Stream
+from tweepy import OAuthHandler
+from tweepy.streaming import StreamListener
+
+from nltk.tokenize import RegexpTokenizer
+
+#Defining the tokenizer
+#used to clean up the tweets a little bit
+#(raw string keeps the regex backslashes literal)
+tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
+
+#Variables for mongodb
+db_name="db_streamT"
+coll1="tweets"
+coll2="word_db"
+coll3="lexical_db"
+
+
+#Connecting to mongodb and make sure collections are accessible
+conn=pymongo.MongoClient()
+try:
+    db=conn[db_name]
+    collection=db[coll1]
+except Exception:
+    print "dbname or collection does not exist"
+    sys.exit(1)
+
+#Variables for lexical diversity
+total_words=0
+word_count=Counter()
+
+
+#Block to determine the lexical diversity of the tweets
+for data in collection.find():
+    #documents without a text field contribute no words
+    if 'text' not in data:
+        continue
+
+    #cleans up any non-printable characters
+    text=filter(lambda x: x in string.printable, data['text'])
+    text=text.replace("\n","")
+    text=text.lower()
+
+    tokens=tokenizer.tokenize(text)
+    total_words += len(tokens)
+    word_count.update(tokens)
+
+
+#Calculation of the lexical diversity
+#An empty corpus has no words to divide by; report 0.0 instead of crashing
+unique_words=len(word_count)
+if total_words:
+    lexical_diversity=float(unique_words)/total_words
+else:
+    lexical_diversity=0.0
+
+#Writes lexical diversity to file
+with open("lexical_diversity.txt","w") as my_file:
+    my_file.write("total number of words: %i\n" %(total_words))
+    my_file.write("total number of unique words: %i\n" %(unique_words))
+    my_file.write("lexical diversity of corpus: %f\n" %(lexical_diversity))
+
+
+#Insert words and number of times the word occurred into mongodb
+words=db[coll2]
+for key, value in word_count.iteritems():
+    words.insert({"word":key, "count":value})
+
+
+#Insert lexical diversity numbers into mongodb
+lexical=db[coll3]
+lexical.insert({"total_words":total_words, "unique_words":unique_words, "lexical_diversity":lexical_diversity})