Skip to content

Assignment 3 - David Paculdo #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions Assignment3_1_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#! /usr/bin/env python
#
# David Paculdo
# W205
# Assignment 3

import sys
import pymongo
import tweepy
import signal
import json
import os
import string
import time

from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener

# Twitter OAuth credentials are read from the environment so that no secret
# is stored in the script. Each value is None when its variable is unset.
ckey = os.getenv("twitter_consumer_key")
csecret = os.getenv("twitter_consumer_secret")

atoken = os.getenv("twitter_access_token")
asecret = os.getenv("twitter_access_token_secret")


# Search terms for the Twitter stream, taken from the first command-line
# argument. The whole list must be passed as ONE quoted argument,
# e.g. "#microsoft,#mojang". Any further arguments are ignored.
if len(sys.argv) < 2:
    # Without search terms there is nothing to track: show the usage line and
    # stop here rather than failing later with a NameError on termList.
    print("Usage: Assignment3_1_1.py \"[search terms separated by comma]\"")
    sys.exit(1)

termList = str(sys.argv[1])
print("Search terms: " + termList)


# Name of the MongoDB database that receives the streamed tweets.
db_name = "db_streamT"

# Instance of the Twitter stream listener
class listener(StreamListener):
    """Stream listener that echoes each tweet and stores it in MongoDB.

    Tweets are inserted into the ``tweets`` collection of the database named
    by the module-level ``db_name``.
    """

    def __init__(self, api):
        """Initialise the base listener and open the MongoDB database.

        :param api: tweepy API object, passed on to the base listener and
            also kept as ``self.api``.
        """
        # super() must name THIS class; naming StreamListener would skip
        # StreamListener.__init__ entirely.
        super(listener, self).__init__(api)
        self.api = api

        # MongoClient() is called without arguments; change db_name (or the
        # "tweets" collection used in on_data) to store the data elsewhere.
        client = pymongo.MongoClient()
        self.db = client[db_name]

    def on_data(self, data):
        """Handle one raw message received from the stream.

        :param data: JSON-encoded message.
        :returns: True, so that the stream keeps running.
        """
        data = json.loads(data)

        # The stream can also deliver non-tweet messages (for example delete
        # or limit notices) that have no "text" field. Skip them instead of
        # letting a KeyError terminate the stream.
        if not isinstance(data, dict) or 'text' not in data:
            return True

        # Keep only printable ASCII characters for the console echo; the
        # document stored below is the complete, unmodified message.
        text = "".join(ch for ch in data['text'] if ch in string.printable)
        print(text)

        # Writing to mongodb. Can change tweets collection to something else
        self.db.tweets.insert(data)

        return True

    def on_error(self, status):
        """Report a stream error and keep the stream alive.

        :param status: HTTP status code reported for the stream.
        :returns: True, so that the stream is not disconnected.
        """
        sys.stderr.write(
            'Encountered error with status code: %s\n' % (status,))
        if status == 420:
            # 420 is Twitter's rate-limiting response; wait ten minutes
            # before carrying on.
            time.sleep(600)
        return True

    def on_timeout(self):
        """Report a stream timeout; returns True to keep the stream alive."""
        sys.stderr.write('Timeout...\n')
        return True


# Begin Twitter stream access: authenticate, then build the API object that is
# handed to the listener.
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(
    auth,
    retry_count=5,
    retry_delay=5,
    retry_errors=set((401, 404, 500, 503)),
    timeout=120
)


# Run the filtered stream until it is interrupted from the keyboard.
# termList is passed as a single list element (the whole comma-separated
# string).
try:
    twitterStream = Stream(auth, listener(api))
    twitterStream.filter(track=[termList])
except KeyboardInterrupt:
    print("Interrupt called")
    sys.exit()
49 changes: 49 additions & 0 deletions Assignment3_1_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# David Paculdo
# W205
# Assignment 3

from boto.s3.connection import S3Connection
from boto.s3.key import Key
import os
import pymongo


# Amazon AWS credentials, read from the environment.
AWS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET = os.environ.get("AWS_SECRET_KEY")


# Connection to AWS. The bucket must already be created.
# The S3 connection gets its own name so that it is not confused with the
# MongoDB connection ("conn") below.
s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = s3_conn.get_bucket("w205-assignment-2-dpaculdo")
tmpfile = "temp_from_s3"

k = Key(bucket)
k.key = "microsoft_OR_mojang_2015-02-07_2015-02-14_tweets_0.txt"


# mongodb variables
db_name = "db_tweets"
coll = "tweets"


# mongodb connection
conn = pymongo.MongoClient()
db = conn[db_name]
collection = db[coll]


# Download the tweet file, then insert one document per line into mongodb.
# The newline of each line is replaced with a space, so a stored tweet that
# came from a newline-terminated line ends with " ".
try:
    k.get_contents_to_filename(tmpfile)
    with open(tmpfile, "r") as my_file:
        for line in my_file:
            collection.insert({"tweet": line.replace("\n", " ")})
finally:
    # Clean up: the file is closed by the "with" block before it is removed,
    # and the temp file goes away even when the download or an insert fails.
    if os.path.exists(tmpfile):
        os.remove(tmpfile)
55 changes: 55 additions & 0 deletions Assignment3_1_2b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# David Paculdo
# W205
# Assignment 3

from boto.s3.connection import S3Connection
from boto.s3.key import Key
import os
import pymongo
import string
import ast


# Amazon AWS variables and connection
AWS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET = os.environ.get("AWS_SECRET_KEY")

# The S3 connection gets its own name so that it is not confused with the
# MongoDB connection ("conn") below.
s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = s3_conn.get_bucket("w205-assignment-2-dpaculdo")


def _raw_file_name(index):
    """Return the S3 key (also used as local file name) of raw file *index*."""
    # Hardcoded name pattern of the raw files to be transferred.
    return "microsoft_OR_mojang_2015-02-07_2015-02-14_" + str(index) + ".raw"


k = Key(bucket)

filecount = 0
rawfile = _raw_file_name(filecount)
k.key = rawfile


# mongodb variables and connection
db_name = "db_restT"
coll = "tweets"
conn = pymongo.MongoClient()
db = conn[db_name]
collection = db[coll]


# Iterates through all raw Twitter data files from S3 and inserts into mongodb.
# Files are numbered 0, 1, 2, ...; the loop ends at the first missing key.
while k.exists():
    try:
        k.get_contents_to_filename(rawfile)
        with open(rawfile, "r") as my_file:
            for line in my_file:
                # A blank line is not a valid literal; skip it rather than
                # let literal_eval raise.
                if not line.strip():
                    continue
                # Each line is parsed as a Python literal and inserted as one
                # document (presumably the repr() of a tweet dict -- verify
                # against the script that wrote the .raw files).
                linedict = ast.literal_eval(line)
                collection.insert(linedict)
    finally:
        # Remove the local copy even when parsing or inserting failed.
        if os.path.exists(rawfile):
            os.remove(rawfile)

    filecount += 1
    rawfile = _raw_file_name(filecount)
    k.key = rawfile

75 changes: 75 additions & 0 deletions Assignment3_2_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# David Paculdo
# W205
# Assignment 3

import os
import pymongo
import string
from collections import Counter


# Global program variables: the three database names used by this script.
db_name = "db_tweets"
db_name2 = "db_restT"
db_name3 = "db_followers"

# Collection names: "tweets" is used with db_name and db_name2,
# "userlist" with db_name3.
coll = "tweets"
users = "userlist"

# One MongoDB client shared by all three databases.
conn = pymongo.MongoClient()


#Make sure database and collection are accessible:
# sys is needed for sys.exit() below but is not among this script's imports,
# so it is imported here.
import sys

# NOTE(review): PyMongo creates databases and collections lazily, so these
# lookups presumably succeed even when nothing exists on the server yet --
# confirm whether a real existence check is wanted here.
try:
    db = conn[db_name]
    collection = db[coll]

    db2 = conn[db_name2]
    collection2 = db2[coll]

    db3 = conn[db_name3]
    collection3 = db3[users]
except Exception:
    # "except Exception" (not a bare except) so that Ctrl-C and SystemExit
    # are not swallowed; exit non-zero because this is a failure.
    print("dbname or collection does not exist")
    sys.exit(1)


# Output file for the most common retweets; it is written further down.
my_file = open("most_retweeted.txt", "w")
prev_tweet = ""

# Collect the text of every stored tweet that is a retweet ("RT " prefix).
# Non-printable characters and newlines are dropped before the check.
tweet_list = []
for tweets in collection.find():
    text = "".join(ch for ch in tweets['tweet'] if ch in string.printable)
    text = text.replace("\n", "")

    if not text.startswith("RT "):
        continue
    tweet_list.append(text)

# Tally identical retweet texts and keep the 30 most common ones as
# (text, count) pairs.
tweet_count = Counter(tweet_list)
tweet_most = tweet_count.most_common(30)


#Block to find information on the users who retweeted the 30 most common retweets
#Then insert into mongodb
#
# collection2 is scanned ONCE: matching documents are grouped by tweet text and
# then emitted in tweet_most order, which gives the same output as rescanning
# the whole collection for each of the top tweets.
#
# NOTE(review): Assignment3_1_2.py stores each tweet with its newline replaced
# by a space, so the texts in tweet_most may end in " " while the texts read
# here do not -- confirm that the exact-match comparison behaves as intended.
retweeters = {tweet: [] for tweet, _count in tweet_most}

for tweets in collection2.find():
    text = "".join(ch for ch in tweets['text'] if ch in string.printable)
    text = text.replace("\n", "")

    if text in retweeters:
        retweeters[text].append(tweets['user'])

try:
    for tweet, _count in tweet_most:
        matching_users = retweeters[tweet]
        if not matching_users:
            continue

        # Each retweet text is printed and written once, ahead of its users.
        print(tweet)
        my_file.write(tweet + "\n")

        for userinfo in matching_users:
            screen_name = userinfo['screen_name']
            followers_count = userinfo['followers_count']
            uid = userinfo['id']
            # A user's location may be None; keep it as None instead of
            # failing on .encode().
            location = userinfo['location']
            if location is not None:
                location = location.encode("utf-8")

            collection3.insert({
                "username": screen_name.encode("utf-8"),
                "user_id": uid,
                "follower_count": followers_count,
                "location": location,
            })
finally:
    # Close the output file so that everything written reaches the disk.
    my_file.close()
80 changes: 80 additions & 0 deletions Assignment3_2_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# David Paculdo
# W205
# Assignment 3

import os
import pymongo
import string
import sys

from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener

from nltk.tokenize import RegexpTokenizer

#Defining the tokenizer
#used to clean up the tweets a little bit
# The pattern has three alternatives: a run of word characters, a "$" followed
# by digits/dots, or any other run of non-whitespace characters.
# It is a raw string so that the backslashes reach the regex engine unchanged
# and are never interpreted as string escapes.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

#Variables for mongodb
db_name = "db_streamT"
coll1 = "tweets"        # source tweets
coll2 = "word_db"       # per-word counts
coll3 = "lexical_db"    # lexical diversity summary


#Connecting to mongodb and make sure collections are accessible
conn = pymongo.MongoClient()

# NOTE(review): PyMongo creates databases and collections lazily, so these
# lookups presumably succeed even when nothing exists on the server yet --
# confirm whether a real existence check is wanted here.
try:
    db = conn[db_name]
    collection = db[coll1]
except Exception:
    # "except Exception" (not a bare except) so that Ctrl-C and SystemExit
    # are not swallowed; exit non-zero because this is a failure.
    print("dbname or collection does not exist")
    sys.exit(1)

# Running totals for the lexical diversity calculation:
# total_words counts every token, word_count maps token -> occurrences.
total_words = 0
word_count = {}


# Tokenize every stored tweet and tally its words.
for data in collection.find():

    # Drop non-printable characters and newlines, then lower-case the text so
    # that words differing only in case are counted together.
    text = "".join(ch for ch in data['text'] if ch in string.printable)
    text = text.replace("\n", "").lower()

    tokens = tokenizer.tokenize(text)
    total_words += len(tokens)
    for word in tokens:
        word_count[word] = word_count.get(word, 0) + 1


#Calculation of the lexical diversity
# Lexical diversity = number of distinct words / total number of words.
# The keys of word_count are already unique, so its length is the number of
# distinct words.
unique_words = len(word_count)
if total_words:
    lexical_diversity = float(unique_words) / total_words
else:
    # Empty corpus: report 0.0 instead of dividing by zero.
    lexical_diversity = 0.0

#Writes lexical diversity to file
# The "with" block closes the file, so the report is flushed to disk.
with open("lexical_diversity.txt", "w") as my_file:
    my_file.write("total number of words: %i\n" % (total_words))
    my_file.write("total number of unique words: %i\n" % (unique_words))
    my_file.write("lexical diversity of corpus: %f\n" % (lexical_diversity))


# Store one document per distinct word, together with its occurrence count.
words = db[coll2]
for token, occurrences in word_count.items():
    words.insert({"word": token, "count": occurrences})


# Store the corpus-level lexical diversity figures as a single document.
lexical = db[coll3]
summary = {
    "total_words": total_words,
    "unique_words": unique_words,
    "lexical_diversity": lexical_diversity,
}
lexical.insert(summary)
Loading