
Commit

initial
jcpeterson committed Feb 18, 2019
1 parent 171c126 commit 25cc5d2
Showing 4 changed files with 175 additions and 0 deletions.
48 changes: 48 additions & 0 deletions extract_urls.py
@@ -0,0 +1,48 @@
import os, sys, json, argparse

from utils import *

parser = argparse.ArgumentParser()
parser.add_argument('--psdir', type=str, default='pushshift_dumps')
parser.add_argument('--year_start', type=int, default=2018)
parser.add_argument('--year_end', type=int, default=2018)
parser.add_argument('--min_karma', type=int, default=3)
args = parser.parse_args()

# collect the dump files whose names mention a year in the requested range
filenames = []
years = range(args.year_start, args.year_end+1)
years = [str(year) for year in years]
for fn in os.listdir(args.psdir):
    for year in years:
        if year in fn:
            filenames.append(fn)
filenames = sorted(filenames)
print('Processing the following files:', filenames)

for fn in filenames:

    hit_count = 0

    path = os.path.join(args.psdir, fn)
    decompress = get_decompresser(fn)

    with decompress(path, "r") as psfile:
        with open(fn+'.goodlinks.txt', 'w') as outfile:

            for line in psfile:
                j = json.loads(line)

                # only take the good links: URL not excluded,
                # score >= min_karma, and not marked NSFW
                if (not is_bad_url(j['url'])) and \
                   (j['score'] > args.min_karma-1) and \
                   (not j['over_18']):

                    outfile.write(j['url'] + '\n')

                    hit_count += 1
                    if hit_count % 10000 == 0:
                        print(hit_count)

            outfile.flush()

    # compress the finished URL list, keeping the uncompressed original (xz -k)
    os.system('xz -zkf '+fn+'.goodlinks.txt')
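For reference, a minimal sketch of the per-line filter this script applies, run on a hypothetical Pushshift submission record (the example JSON line and min_karma value are mine, not from the commit; note that score > min_karma-1 is just score >= min_karma):

import json
from utils import is_bad_url

# hypothetical submission line with only the fields extract_urls.py reads
line = '{"url": "https://example.com/story", "score": 5, "over_18": false}'
j = json.loads(line)

min_karma = 3  # same default as the --min_karma flag
keep = (not is_bad_url(j['url'])) and \
       (j['score'] > min_karma-1) and \
       (not j['over_18'])
print(keep)  # True: allowed domain, score >= 3, not NSFW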
Binary file added pushshift_dumps/RS_v2_2005-06.xz
Binary file added pushshift_dumps/RS_v2_2005-07.xz
127 changes: 127 additions & 0 deletions utils.py
@@ -0,0 +1,127 @@
import bz2
try: # python3
    import lzma
except ImportError: # python2
    from backports import lzma

def get_decompresser(fn):
    if '.bz2' in fn:
        decompress = bz2.BZ2File
    elif '.xz' in fn:
        decompress = lzma.open
    return decompress

# the below is adapted from:
# https://github.com/eukaryote31/openwebtext/blob/master/filter.py

import tldextract

# domains that aren't scraper friendly. do not include subdomains!
exclude_domains = set([

# image & video hosting sites
'imgur.com',
'redd.it',
'gfycat.com',
'giphy.com',
'reddituploads.com',
'redditmedia.com',
'twimg.com',
'sli.mg',
'magaimg.net',
'flickr.com',
'imgflip.com',
'youtube.com',
'youtu.be',
'youtubedoubler.com',
'vimeo.com',
'twitch.tv',
'streamable.com',
'bandcamp.com',
'soundcloud.com',
'video.google.com',

# not scraper friendly
'reddit.com',
'gyazo.com',
'github.com',
'xkcd.com',
'twitter.com',
'spotify.com',
'itunes.apple.com',
'facebook.com',
'gunprime.com',
'strawpoll.me',
'voyagefusion.com',
'rollingstone.com',
'google.com',
'timeanddate.com',
'walmart.com',
'roanoke.com',
'spotrac.com',

# other non-text content
'ebay.com',

# remove these?
# 'reverb.com',

# original paper excluded wikipedia
'wikipedia.org',

# lots of top posts for this one
'battleforthenet.com',
])

exclude_extensions = (
'.png',
'.jpg',
'.jpeg',
'.gif',
'.gifv',
'.pdf',
'.mp4',
'.mp3',
'.ogv',
'.webm',
'.doc',
'.docx',
'.log',
'.csv',
'.dat',
'.iso',
'.bin',
'.exe',
'.apk',
'.jar',
'.app',
'.ppt',
'.pps',
'.pptx',
'.xml',
'.gz',
'.xz',
'.bz2',
'.tgz',
'.tar',
'.zip',
'.wma',
'.mov',
'.wmv',
'.3gp',
'.svg',
)

def is_bad_url(url):
    ext = tldextract.extract(url)
    domain = '.'.join([x for x in ext if x])
    basedomain = '.'.join(ext[-2:])

    # check both the registered domain and the full domain (with subdomain)
    if basedomain in exclude_domains or \
       domain in exclude_domains:
        return True

    # strip any query string before checking the file extension
    if url.split('?')[0].endswith(exclude_extensions):
        return True

    return False
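A quick sketch of how is_bad_url behaves on a few hypothetical URLs (the example URLs are mine, not from the commit; tldextract fetches its public-suffix list on first use):

from utils import is_bad_url

print(is_bad_url('https://i.imgur.com/abc123.png'))       # True: imgur.com is an excluded domain
print(is_bad_url('https://example.com/paper.pdf?dl=1'))   # True: .pdf extension, query string ignored
print(is_bad_url('https://example.com/2019/02/article'))  # False: passes both checks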
