# fetch_urls.py (39 lines, 33 loc, 1.11 KB)
"""Tool for fetching URLs from pushshift.io."""
# Import dependencies
import os
from urllib import request as req
import re
import pycurl
# Define values
# Page that main() fetches and scans for links; main() appends "/<filename>"
# to it to build each download URL, hence the missing trailing slash.
BASE_URL = "https://files.pushshift.io/reddit/submissions" # No trailing slash
# Regex whose single capture group is the href value of an anchor tag.
LINK_RE_PATTERN = r"<a\s.*href=[\"'](\S+)[\"'][^>]*>\S*<\/a>"
# Directory that main() creates and writes the downloaded files into.
OUTPUT_DIR = "pushshift_dumps_full"
# Define functions
def main():
    """Download every file linked from the page at ``BASE_URL``.

    Fetches the HTML at ``BASE_URL``, extracts the ``href`` of each anchor
    matched by ``LINK_RE_PATTERN``, keeps the relative links that start with
    ``./``, and downloads each one into ``OUTPUT_DIR`` using a single
    reusable pycurl handle.

    Links that do not name a plain file directly under ``BASE_URL`` (empty
    names, ``.``/``..``, or names containing a path separator) are skipped
    and reported instead of aborting the run.

    Raises:
        urllib.error.URLError: If the index page cannot be fetched.
        pycurl.error: If a transfer fails.
        OSError: If the output directory or a target file cannot be created.
    """
    # Get links; the context manager closes the HTTP response promptly.
    link_re = re.compile(LINK_RE_PATTERN)
    with req.urlopen(BASE_URL) as response:
        index_html = response.read().decode("utf-8")
    raw_links = link_re.findall(index_html)
    # De-duplicate, then sort so the download order is reproducible.
    individual_links = sorted(
        {link for link in raw_links if link.startswith("./")}
    )

    # Download files
    # exist_ok allows re-running the script when the directory already exists.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    curl = pycurl.Curl()
    try:
        for link in individual_links:
            filename = link[2:]  # Strip the leading "./"
            # The name comes from a remote page: refuse anything that is not
            # a plain file name so open() cannot fail on a directory or
            # write outside OUTPUT_DIR.
            if (
                filename in ("", ".", "..")
                or os.path.basename(filename) != filename
            ):
                print("Skipping", link)
                continue
            url = BASE_URL + "/" + filename
            with open(os.path.join(OUTPUT_DIR, filename), "wb") as file:
                curl.setopt(curl.URL, url)
                curl.setopt(curl.WRITEDATA, file)
                curl.perform()
            # Report only after the file has been flushed and closed.
            print("Downloaded", filename)
    finally:
        # Release the curl handle even if a transfer or file open fails.
        curl.close()
# Start the download only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()