-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwriteCSV.py
114 lines (91 loc) · 3.96 KB
/
writeCSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# *****************************************************************************
#
# Input tweets saved as either a JSON or a Python Pickle file and write the
# tweet ID, poster's screen_name and name, timestamp, number of retweets,
# number of favorites, and the tweet text as a comma separated value file
# (CSV). Tested with Python 2.7.9
#
# Simplest usage: python writeCSV.py INPUT-FILENAME
#
# options:
# -h | --help Print help information
# -c | --csv CSV-FILENAME
# -s | --sort Sort the tweets in descending tweet ID
# -a | --ascend Use ascending order if sort is selected
#
# Keith Eric Grant ([email protected])
# 13 Mar 2015
#
# *****************************************************************************
import sys
import os
import argparse
import simplejson as json
import codecs
import pickle
import re
# Program name handed to argparse as ``prog`` (shown in usage/help text).
myName = "writeCSV.py"
# Version string handed to the command-line parser.
version = "1.0.0"
# Column titles written as the first line of the CSV file.
_CSV_HEADER = (u'Tweet ID', u'Screen Name', u'Name', u'Time-Stamp',
               u'Retweets', u'Favorites', u'Text')

# Line breaks inside a tweet are flattened to spaces. The CRLF pair is tried
# first so that a single Windows line break becomes one space, not two.
_LINE_BREAK = re.compile(r'\r\n|\n|\r+')


def _to_text(value):
    """Return ``value`` as a Unicode string.

    Byte strings are decoded as UTF-8, with undecodable bytes replaced by
    U+FFFD, instead of relying on the implicit ASCII decoding that raises
    UnicodeDecodeError for non-ASCII bytes. Any other value is passed to the
    text type's constructor.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3
    return text_type(value)


def _csv_quote(value):
    """Return ``value`` as a double-quoted CSV field.

    Embedded double quotes are doubled, and the whole field is wrapped in
    double quotes so that embedded commas do not split the column.
    """
    return u'"' + _to_text(value).replace(u'"', u'""') + u'"'


def main(argv=None):
    """Write a CSV summary of the tweets stored in a JSON or pickle file.

    Parameters:
        argv: full argument vector including the program name at index 0;
            defaults to ``sys.argv`` when None.

    Returns:
        0 after the CSV file has been written, or 1 when the input file's
        extension is neither ``.json`` nor ``.pickle``.

    One row is written per tweet: tweet ID, screen name, name, time-stamp,
    retweet count, favorite count and text. The output file is UTF-8 and
    defaults to ``tweets.csv`` when ``--csv`` is not given.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        prog=myName,
        description='Write a CSV summary of collected tweets')
    # ArgumentParser(version=...) is deprecated in Python 2.7 and rejected by
    # Python 3; an explicit 'version' action keeps the same -v/--version flag.
    parser.add_argument('-v', '--version', action='version', version=version)
    parser.add_argument('inpfile', action='store',
                        help='Input tweet filename (.json or .pickle)')
    parser.add_argument('--csv', '-c', action='store', dest='csv',
                        help='CSV output file name')
    parser.add_argument('--sort', '-s', action='store_true', dest='sort',
                        default=False, help='Sort by decreasing ID')
    parser.add_argument('--ascend', '-a', action='store_true', dest='ascend',
                        default=False, help='Use ascending sort')
    args = parser.parse_args(argv[1:])

    inpfile = args.inpfile
    descending = not args.ascend
    csv_filename = args.csv if args.csv else 'tweets.csv'

    # Load tweets from the input file, whether JSON or pickle.
    ext = os.path.splitext(inpfile)[1].lower()
    if ext == '.json':
        with codecs.open(inpfile, 'rb', 'utf-8') as inp:
            tweets = json.load(inp)
    elif ext == '.pickle':
        # SECURITY NOTE: unpickling can execute arbitrary code. Only feed
        # this script pickle files that come from a trusted source.
        with open(inpfile, 'rb') as inp:
            tweets = pickle.load(inp)
    else:
        print('Unknown extension {}'.format(ext))
        return 1

    print('{}: {} tweets'.format(inpfile, len(tweets)))

    if args.sort:
        print('Sorting tweets')
        tweets.sort(key=lambda tweet: tweet['id'], reverse=descending)

    print('Writing CSV file {}'.format(csv_filename))

    with codecs.open(csv_filename, mode='w', encoding='utf-8') as out:
        out.write(u','.join(_CSV_HEADER) + u'\n')

        for tweet in tweets:
            # User is a structure within a tweet that contains the poster's
            # screen name and full name.
            user = tweet['user']
            row = (
                _to_text(tweet['id_str']),
                _to_text(user['screen_name']),
                _csv_quote(user['name']),
                _to_text(tweet['created_at']),
                # The retweet and favorite counts aren't always present,
                # so they default to zero.
                tweet.get('retweet_count', 0),
                tweet.get('favorite_count', 0),
                # Flatten embedded line breaks so each tweet stays on one
                # CSV line, then quote the result.
                _csv_quote(_LINE_BREAK.sub(u' ', _to_text(tweet['text']))),
            )
            out.write(u'%s,%s,%s,%s,%d,%d,%s\n' % row)

    return 0
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    # Hand main()'s return value to the interpreter as the exit status.
    exit_status = main()
    sys.exit(exit_status)