-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2stacks-get-forum-threads.py
73 lines (63 loc) · 4.13 KB
/
2stacks-get-forum-threads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import requests
import json
import re
import config
import helpers
# import logging
# logger = logging.getLogger()
# logger.setLevel(logging.INFO)
# Path on the SCUTTLE callback host that receives each page of thread ids.
_THREADS_ENDPOINT = '/2stacks/forum/threads'

# Captures the thread id from a thread link in the category listing markup.
# The pattern is anchored on the exact newline + tab run that precedes those
# links, so it is sensitive to the whitespace of the fetched HTML.
_THREAD_LINK_PATTERN = r'(?:\n\t\t\t\t\t\t\t\t\t\t\t\t<a href="\/forum\/t-)([^\/]*)'

# Localised text that precedes the total page count in the pager
# ("page 1 of N"), keyed by Wikidot site name.
_PAGER_PREFIXES = {
    'fondationscp': 'page 1 de ',     # SCP-FR
    'scp-wiki-de': 'Seite 1 von ',    # SCP-DE
    'scp-pl': 'strona 1 z ',          # SCP-PL
    'scp-pt-br': 'página 1 do ',      # SCP-PT
    'fondazionescp': 'pagina 1 di ',  # SCP-IT
    'scpko': '페이지: 1 / ',           # SCP-KO
}

# SCP-EN and English-speaking wikis; some -INT sites never translated the
# pager (e.g. -RU, -UA, -CN), so this is also the fallback.
_DEFAULT_PAGER_PREFIX = 'page 1 of '


def _fetch_listing(forum_id, page_no, wikidot_site):
    """Request one page of a forum category listing through ``helpers.fetch``."""
    data = {'c': forum_id, 'p': page_no, 'moduleName': 'forum/ForumViewCategoryModule'}
    return helpers.fetch(data, wikidot_site)


def _page_count(haystack, wikidot_site):
    """Return the number of listing pages announced by the pager, or 1 if none.

    The pager markup can occur more than once in the response; every
    occurrence carries the same total, so the first match is used.
    Raises ``TypeError`` if ``haystack`` is not a string.
    """
    prefix = _PAGER_PREFIXES.get(wikidot_site, _DEFAULT_PAGER_PREFIX)
    # ``\d+`` (not ``\d*``) so a pager without a number counts as "no pager"
    # instead of yielding an empty string that ``int()`` would reject.
    found = re.search('(?:<span class="pager-no">' + prefix + r')(\d+)', haystack)
    return int(found.group(1)) if found else 1


def _send_threads(callback_url, forum_id, threads):
    """PUT one page worth of thread ids to the SCUTTLE callback endpoint."""
    headers = {"Authorization": "Bearer " + config.scuttle_token, "Content-Type": "application/json"}
    output = json.dumps({"wd_forum_id": forum_id, "threads": threads})
    requests.put(callback_url + _THREADS_ENDPOINT, data=output, headers=headers)


def _process_forum(callback_url, forum_id, wikidot_site):
    """Fetch every listing page of one forum category and forward its threads.

    Returns ``True`` when all pages were forwarded, ``False`` when a fetched
    page could not be parsed.
    """
    haystack = _fetch_listing(forum_id, 1, wikidot_site)
    try:
        threads = re.findall(_THREAD_LINK_PATTERN, haystack)
        last_page = _page_count(haystack, wikidot_site)
    except TypeError:
        # ``re`` raises TypeError when the fetch result is not text.
        # NOTE(review): reportedly this only happens for a deleted page.
        # TODO Make scuttle handle this.
        return False
    _send_threads(callback_url, forum_id, threads)

    # Page 1 has already been sent above, so continue from page 2.
    for page_no in range(2, last_page + 1):
        haystack = _fetch_listing(forum_id, page_no, wikidot_site)
        try:
            threads = re.findall(_THREAD_LINK_PATTERN, haystack)
        except TypeError:
            # TODO Make scuttle handle this.
            return False
        _send_threads(callback_url, forum_id, threads)
    return True


def lambda_handler(event, context):
    """Forward the thread ids of each requested forum category to SCUTTLE.

    Each entry of ``event['Records']`` must carry ``callback_url``,
    ``forum_id`` and ``wikidot_site`` under ``messageAttributes`` (each as a
    ``{'stringValue': ...}`` mapping). For every record the category listing
    is fetched page by page, and each page's thread ids are PUT to
    ``<callback_url>/2stacks/forum/threads``.

    Args:
        event: Mapping with a ``'Records'`` list as described above.
        context: Unused.

    Returns:
        ``{'job': 'complete'}`` once every record has been handled, or
        ``False`` as soon as a fetched page cannot be parsed (remaining
        records are then not processed).

    Raises:
        KeyError: If a record lacks one of the expected message attributes.
    """
    for record in event['Records']:
        attributes = record['messageAttributes']
        callback_url = attributes['callback_url']['stringValue']
        forum_id = attributes['forum_id']['stringValue']
        wikidot_site = attributes['wikidot_site']['stringValue']
        # A single-page forum must not end the batch early: keep going so the
        # remaining records are handled too.
        if not _process_forum(callback_url, forum_id, wikidot_site):
            return False
    return {'job': 'complete'}