
Commit

initial
jcpeterson committed Feb 18, 2019
1 parent 171c126 commit 25cc5d2
Showing 4 changed files with 175 additions and 0 deletions.
48 changes: 48 additions & 0 deletions extract_urls.py
@@ -0,0 +1,48 @@
import os, sys, json, argparse

from utils import *

parser = argparse.ArgumentParser()
parser.add_argument('--psdir', type=str, default='pushshift_dumps')
parser.add_argument('--year_start', type=int, default=2018)
parser.add_argument('--year_end', type=int, default=2018)
parser.add_argument('--min_karma', type=int, default=3)
args = parser.parse_args()

# collect the dump files whose names mention a year in the requested range
filenames = []
years = range(args.year_start, args.year_end+1)
years = [str(year) for year in years]
for fn in os.listdir(args.psdir):
    for year in years:
        if year in fn:
            filenames.append(fn)
filenames = sorted(filenames)
print('Processing the following files:', filenames)

for fn in filenames:

    hit_count = 0

    path = os.path.join(args.psdir, fn)
    decompress = get_decompresser(fn)

    with decompress(path, "r") as psfile:
        with open(fn+'.goodlinks.txt', 'w') as outfile:

            for line in psfile:
                j = json.loads(line)

                # only take the good links: URL not excluded,
                # score >= min_karma, and not marked NSFW
                if (not is_bad_url(j['url'])) and \
                   (j['score'] > args.min_karma-1) and \
                   (not j['over_18']):

                    outfile.write(j['url'] + '\n')

                    hit_count += 1
                    if hit_count % 10000 == 0:
                        print(hit_count)

            outfile.flush()

    # compress the finished URL list, keeping the uncompressed original (xz -k)
    os.system('xz -zkf '+fn+'.goodlinks.txt')
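For reference, a minimal sketch of the per-line filter this script applies, run on a hypothetical Pushshift submission record (the example JSON line and min_karma value are mine, not from the commit; note that score > min_karma-1 is just score >= min_karma):

import json
from utils import is_bad_url

# hypothetical submission line with only the fields extract_urls.py reads
line = '{"url": "https://example.com/story", "score": 5, "over_18": false}'
j = json.loads(line)

min_karma = 3  # same default as the --min_karma flag
keep = (not is_bad_url(j['url'])) and \
       (j['score'] > min_karma-1) and \
       (not j['over_18'])
print(keep)  # True: allowed domain, score >= 3, not NSFW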
Binary file added pushshift_dumps/RS_v2_2005-06.xz
Binary file added pushshift_dumps/RS_v2_2005-07.xz
127 changes: 127 additions & 0 deletions utils.py
@@ -0,0 +1,127 @@
import bz2
try: # python3
    import lzma
except ImportError: # python2
    from backports import lzma

def get_decompresser(fn):
    if '.bz2' in fn:
        decompress = bz2.BZ2File
    elif '.xz' in fn:
        decompress = lzma.open
    return decompress

# the below is adapted from:
# https://github.com/eukaryote31/openwebtext/blob/master/filter.py

import tldextract

# domains that aren't scraper friendly. do not include subdomains!
exclude_domains = set([

# image & video hosting sites
'imgur.com',
'redd.it',
'gfycat.com',
'giphy.com',
'reddituploads.com',
'redditmedia.com',
'twimg.com',
'sli.mg',
'magaimg.net',
'flickr.com',
'imgflip.com',
'youtube.com',
'youtu.be',
'youtubedoubler.com',
'vimeo.com',
'twitch.tv',
'streamable.com',
'bandcamp.com',
'soundcloud.com',
'video.google.com',

# not scraper friendly
'reddit.com',
'gyazo.com',
'github.com',
'xkcd.com',
'twitter.com',
'spotify.com',
'itunes.apple.com',
'facebook.com',
'gunprime.com',
'strawpoll.me',
'voyagefusion.com',
'rollingstone.com',
'google.com',
'timeanddate.com',
'walmart.com',
'roanoke.com',
'spotrac.com',

# other non-text content
'ebay.com',

# remove these?
# 'reverb.com',

# original paper excluded wikipedia
'wikipedia.org',

# lots of top posts for this one
'battleforthenet.com',
])

exclude_extensions = (
'.png',
'.jpg',
'.jpeg',
'.gif',
'.gifv',
'.pdf',
'.mp4',
'.mp3',
'.ogv',
'.webm',
'.doc',
'.docx',
'.log',
'.csv',
'.dat',
'.iso',
'.bin',
'.exe',
'.apk',
'.jar',
'.app',
'.ppt',
'.pps',
'.pptx',
'.xml',
'.gz',
'.xz',
'.bz2',
'.tgz',
'.tar',
'.zip',
'.wma',
'.mov',
'.wmv',
'.3gp',
'.svg',
)

def is_bad_url(url):
    ext = tldextract.extract(url)
    domain = '.'.join([x for x in ext if x])
    basedomain = '.'.join(ext[-2:])

    # check both the registered domain and the full domain (with subdomain)
    if basedomain in exclude_domains or \
       domain in exclude_domains:
        return True

    # strip any query string before checking the file extension
    if url.split('?')[0].endswith(exclude_extensions):
        return True

    return False
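A quick sketch of how is_bad_url behaves on a few hypothetical URLs (the example URLs are mine, not from the commit; tldextract fetches its public-suffix list on first use):

from utils import is_bad_url

print(is_bad_url('https://i.imgur.com/abc123.png'))       # True: imgur.com is an excluded domain
print(is_bad_url('https://example.com/paper.pdf?dl=1'))   # True: .pdf extension, query string ignored
print(is_bad_url('https://example.com/2019/02/article'))  # False: passes both checks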
