Assignment3 final #4

Open: wants to merge 2 commits into master

Changes from all commits
59 changes: 59 additions & 0 deletions 1_1/1_1_concatenate_into_streamT.py
@@ -0,0 +1,59 @@
#Kasane Utsumi - 3/14/2015
#1_1_concatenate_into_streamT.py
#This script dumps all seven collections named db_streamT%StartDate% (each %StartDate% is a start date passed as a command-line argument) into db_streamT.
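#A usage sketch (the dates below are hypothetical; the script expects one
#start-date argument per daily collection, seven in total):
#   python 1_1_concatenate_into_streamT.py 2015-03-07 2015-03-08 2015-03-09 2015-03-10 2015-03-11 2015-03-12 2015-03-13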

import sys
import signal

import pymongo

def interrupt(signum, frame):
    print "Interrupted, closing ..."
    exit(1)

#register the handler so Ctrl-C closes cleanly
signal.signal(signal.SIGINT, interrupt)

try:
    mongoConnection = pymongo.MongoClient()
except pymongo.errors.ConnectionFailure:
    print "Connection failed"
    exit(1)

#get the database and the target collection
database = mongoConnection['twitter_analyzer']
db_streamT = database.db_streamT

#clear any existing content before refilling
db_streamT.drop()

#one source collection per start date passed on the command line
streams = [database['db_streamT' + arg] for arg in sys.argv[1:8]]

#total document count across all source collections, to compare with the count
#of db_streamT after filling it and verify that the concatenation succeeded
individualTotal = sum(stream.find().count() for stream in streams)


def addThisCollection(collection):
    for content in collection.find():
        db_streamT.insert(content)

for stream in streams:
    addThisCollection(stream)

print "individual total is " + str(individualTotal)
print "StreamT length is " + str(db_streamT.find().count())
print "Number of items match?: " + str(individualTotal == db_streamT.find().count())

59 changes: 59 additions & 0 deletions 1_1/1_1_store_tweets_into_db_streamT.py
@@ -0,0 +1,59 @@
#Kasane Utsumi - 3/14/2015
#1_1_store_tweets_into_db_streamT.py
#This script retrieves tweets matching the search term "#microsoft OR #mojang" within the given date range and dumps them into the db_streamT%StartDate% collection (%StartDate% is the start date passed on the command line).
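#A usage sketch (hypothetical dates; the two arguments are the start and end
#of the date range to fetch):
#   python 1_1_store_tweets_into_db_streamT.py 2015-03-07 2015-03-08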

import sys
import signal
import urllib

import pymongo
import tweepy

def interrupt(signum, frame):
    print "Interrupted, closing ..."
    exit(1)

#register the handler so Ctrl-C closes cleanly
signal.signal(signal.SIGINT, interrupt)


#configure tweepy (fill in your own Twitter API credentials before running)
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth_handler=auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

#URL-encode the search string and the start/end dates from the command line
q = urllib.quote_plus("#microsoft OR #mojang")
start = urllib.quote_plus(sys.argv[1])
end = urllib.quote_plus(sys.argv[2])

#set up mongodb collection
try:
    mongoConnection = pymongo.MongoClient()
except pymongo.errors.ConnectionFailure:
    print "Connection failed"
    exit(1)

#use a separate collection for each day and merge them all into db_streamT later,
#so that if an error happens while getting tweets for one day the whole week
#does not have to be fetched again.
database = mongoConnection['twitter_analyzer']
db_streamT = database['db_streamT' + start]

#clear the collection if it already holds data
db_streamT.drop()

# Additional query parameters:
#   since:{date}
#   until:{date}
# They are appended to the 'q' variable: q + " since:2014-01-01 until:2014-01-02"
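#For example, with hypothetical dates, the final query string passed to the
#API would look like:
#   "%23microsoft+OR+%23mojang since:2015-03-07 until:2015-03-08"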

try:
    for tweet in tweepy.Cursor(api.search, q=q + " since:" + start + " until:" + end).items():
        db_streamT.insert(tweet._json)
except tweepy.TweepError:
    print "tweet retrieval failed, exiting"
    exit(1)
42 changes: 42 additions & 0 deletions 1_2/1_2_fill_db_tweets.py
@@ -0,0 +1,42 @@
#Kasane Utsumi - 3/14/2015
#1_2_fill_db_tweets.py
#This script takes all tweets (stored with their full JSON) from db_streamT and stores ONLY the tweet text in the db_tweets collection.
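#Each resulting db_tweets document keeps just the tweet text, e.g. a
#hypothetical entry: {"text": "Trying the new #microsoft build today"}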

import signal

import pymongo

def interrupt(signum, frame):
    print "Interrupted, closing ..."
    exit(1)

signal.signal(signal.SIGINT, interrupt)

try:
    mongoConnection = pymongo.MongoClient()
except pymongo.errors.ConnectionFailure:
    print "Connection failed"
    exit(1)

#get the database and check that the source collection exists
database = mongoConnection['twitter_analyzer']

if "db_streamT" not in database.collection_names():
    print "db_streamT not found! exiting..."
    exit(1)

db_streamT = database.db_streamT
db_tweets = database.db_tweets

#clear the current content
db_tweets.drop()

#copy only the "text" field of each stored tweet into db_tweets
for tJson in db_streamT.find():
    tweetOnlyEntry = {"text": tJson["text"]}
    db_tweets.insert(tweetOnlyEntry)

#check that the copy succeeded
print "Does length of db_streamT equal that of db_tweets? " + str(db_streamT.find().count() == db_tweets.find().count())
Binary file added 2_1/.hello.txt.swp
Binary file not shown.