"""
A quick and dirty script to parse Reddit dumps
(written using the pushshift 40k top subreddit 2005-2023 archive)
Reads in submission and comment jsonl files, scans for keywords,
then outputs select fields in CSV format.
Submissions and comments are in chronological order.
Finds and outputs matching submissions and all comments belonging
to matching submissions, ignores matching comments to non-matching
submissions. Most useful for structured analysis when the keywords
do a good job of identifying relevant submissions.
"""
REDDITS = ["breastcancer"]
KEYWORDS = ["flat", "aesthetic closure", "goldilocks", "goldilock"]
SUBMISSION_COLUMNS = ["subreddit","type","title","author","score","selftext","url","id","permalink","created_utc","date"]
COMMENT_COLUMNS = ["subreddit","type","author","score","body","id","parent_id","submission_id","permalink","created_utc","date"]
# Reddit API notes
# 'score' is the total score ('ups' - 'downs') of a post. 'ups' and
# 'downs' are deprecated - 'ups' is now always the same as 'score' and
# 'downs' is always zero.
# 'controversiality' is a boolean flag indicating that a comment received
# a large but roughly equal number of upvotes and downvotes.
#
# Reddit API changes mean that some fields (e.g. created_utc) have had
# different types over time and some (e.g. permalink) may not exist
# at all.
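# A minimal sketch of defensive timestamp handling, assuming created_utc may
# arrive as an int, a float, or a numeric string. The pipeline below simply
# calls int(j["created_utc"]), which is enough for these dumps; this
# hypothetical helper is not wired into the code that follows.
def _created_utc_to_int(value):
    """Coerce a created_utc value (int, float, or numeric string) to whole seconds."""
    return int(float(value))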
# Reddit Thing Fullnames (from the Reddit API docs)
# "A fullname is a combination of a thing's type (e.g. Link) and its unique ID
# which forms a compact encoding of a globally unique ID on reddit.
# Fullnames start with the type prefix for the object's type, followed by the
# thing's unique ID in base 36. For example, t3_15bfi0."
# Type prefixes:
# t1_ Comment
# t2_ Account
# t3_ Link (submissions are Links in the API)
# t4_ Message
# t5_ Subreddit
# t6_ Award
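# Illustrative sketch only (not used by the pipeline below): a fullname such
# as "t3_15bfi0" splits into a type prefix and a base-36 id. The script relies
# on this when it prepends "t3_"/"t1_" to ids and checks parent_id prefixes.
def _split_fullname(fullname):
    """Split e.g. 't3_15bfi0' into ('t3', '15bfi0')."""
    prefix, _, thing_id = fullname.partition("_")
    return prefix, thing_id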
import csv
import json
import re
from datetime import datetime

SAMPLE = False  # set True to read the smaller *_sample input files instead of the full dumps
def get_submission(j, comments):
    """Walk up the parent chain of comment j until the parent is a submission
    (fullname prefixed "t3_") and return that fullname. Returns None if an
    ancestor comment was not kept, i.e. the chain leaves the set of comments
    captured so far."""
    while True:
        if j["parent_id"][0:3] == "t3_":
            return j["parent_id"]
        if j["parent_id"] in comments:
            j = comments[j["parent_id"]]
        else:
            return None
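# For example (hypothetical ids): with comments == {"t1_b": {"parent_id": "t3_a"}}
# and j == {"parent_id": "t1_b"}, get_submission(j, comments) walks
# t1_b -> t3_a and returns "t3_a".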
# Collapse runs of non-word characters and underscores to single spaces so
# that the keyword matching below is whole-word and punctuation-insensitive.
cleanup_pattern = re.compile(r'[\W_]+')
# Per-subreddit set of fullname ids ("t3_...") of submissions that matched a keyword.
submission_ids = {}
for reddit in REDDITS:
    if SAMPLE:
        submissions_filename = reddit + "_submissions_sample"
    else:
        submissions_filename = reddit + "_submissions"
    submissions = []
    with open(submissions_filename, "r", encoding="UTF-8") as submissions_file:
        for line in submissions_file:
            j = json.loads(line)
            # Pad with spaces so the keyword test below only matches whole words
            title = " " + cleanup_pattern.sub(' ', j["title"].lower()) + " "
            selftext = " " + cleanup_pattern.sub(' ', j["selftext"].lower()) + " "
            if any(" " + keyword + " " in selftext or " " + keyword + " " in title for keyword in KEYWORDS):
                submission = {}
                submission["subreddit"] = j["subreddit"]
                submission["type"] = "submission"
                submission["title"] = j["title"]
                submission["author"] = j["author"]
                submission["score"] = j["score"]
                submission["selftext"] = j["selftext"]
                submission["url"] = j["url"]
                submission["id"] = "t3_" + j["id"]
                submission["permalink"] = j["permalink"]
                submission["created_utc"] = j["created_utc"]
                submission["date"] = datetime.fromtimestamp(int(j["created_utc"])).strftime('%Y-%m-%d')
                submissions.append(submission)
    submission_ids[reddit] = {submission["id"] for submission in submissions}
    # The ./output directory must already exist; newline="" avoids blank rows
    # in the CSV on Windows.
    with open("./output/" + reddit + "_submissions.csv", "w", encoding="UTF-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=SUBMISSION_COLUMNS)
        writer.writeheader()
        for submission in submissions:
            writer.writerow(submission)
    if SAMPLE:
        comments_filename = reddit + "_comments_sample"
    else:
        comments_filename = reddit + "_comments"
    comments = {}
    with open(comments_filename, "r", encoding="UTF-8") as comments_file:
        for line in comments_file:
            j = json.loads(line)
            # Keep a comment if its parent is a matching submission or an
            # already-kept comment. This only works because the comments are
            # in chronological order and children cannot come before parents!
            if j["parent_id"] in submission_ids[reddit] or j["parent_id"] in comments:
                comment = {}
                comment["subreddit"] = j["subreddit"]
                comment["type"] = "comment"
                comment["author"] = j["author"]
                comment["score"] = j["score"]
                comment["body"] = j["body"]
                comment["id"] = "t1_" + j["id"]
                comment["parent_id"] = j["parent_id"]
                comment["submission_id"] = get_submission(j, comments)
                if "permalink" in j:
                    comment["permalink"] = j["permalink"]
                else:
                    # sometimes (maybe because of older API versions) there is no permalink
                    comment["permalink"] = "NONE"
                comment["created_utc"] = j["created_utc"]
                comment["date"] = datetime.fromtimestamp(int(j["created_utc"])).strftime('%Y-%m-%d')
                comments[comment["id"]] = comment
with open("./output/"+reddit+"_comments"+".csv","w",encoding="UTF-8") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=COMMENT_COLUMNS)
writer.writeheader()
for comment in comments:
writer.writerow(comments[comment])