-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.py
202 lines (156 loc) · 6.96 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
from bs4 import BeautifulSoup, NavigableString
from requests import get
from sys import argv
from queue import Queue
from threading import Thread
from time import time
from sys import stdout
base = "https://en.wikipedia.org"
midpoint = base + "/wiki/United_States"
prefixes = ["Template", "Template_talk", "Template talk", "Talk", "User", "User talk", "Wikipedia", "Wikipedia talk", "File", "File talk", "MediaWiki", "MediaWiki talk", "Template", "Template talk", "Help", "Help talk", "Category", "Category talk", "Portal", "Portal talk", "Draft", "Draft talk", "TimedText", "TimedText talk", "Module", "Module talk", "Book", "Book talk", "Education Program", "Education Program talk", "Gadget", "Gadget talk", "Gadget definition", "Gadget definition talk", "Special", "Media"]
max_links = 50
midpoint_links = []
def search(start, end):
def url_input(text):
if "/wiki/" not in text:
text = "/wiki/" + text
if "wikipedia.org" not in text.lower():
text = base + text
return text
def validate(href):
if href and len(href) > 6 and href[:6] == "/wiki/":
href = href.split("#")[0]
if ":" in href:
prefix = href.split("/wiki/")[1].split(":")[0]
if prefix in prefixes:
return False
return href
return False
def forward_tree(urls, url, done, checked, end, result, midpoint_links):
response = get(url, allow_redirects=True)
title = response.url.lower().split("/wiki/")[1].split("#")[0]
if title == end:
return done(urls)
html = response.text
dom = BeautifulSoup(html, features="html.parser")
div = dom.find("div", {"class": "mw-parser-output"})
if not div: return
links = div.find_all("a")
parsed_links = []
for l in links:
href = validate(l.get("href"))
if not href: continue
total = urls + [(l.contents[0], base + href)]
if href.lower().split("/wiki/")[1] == end:
return done(total)
if total[-1][1] not in checked:
parsed_links.append(total)
return (title, len(response.text), parsed_links)
def backward_tree(urls, url, done, checked, end, result, midpoint_links):
title = url.split("/wiki/")[1]
wlh = f"https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/{title}&namespace=0&limit=2000&hideredirs=1&hidetrans=1"
response = get(wlh, allow_redirects=True)
html = response.text
dom = BeautifulSoup(html, features="html.parser")
parsed_links = []
ul = dom.find("ul", {"id": "mw-whatlinkshere-list"})
if not ul: return
for li in ul.contents:
if isinstance(li, NavigableString): continue
if not li.contents: continue
href = validate(li.contents[0].get("href"))
if not href: continue
total = urls + [(str(li.contents[0].contents[0]), base + href)]
found_title = href.lower().split("/wiki/")[1]
if found_title == end:
return done(total)
elif found_title in midpoint_links:
total += [midpoint]
return done(total)
if total[-1][1] not in checked:
parsed_links.append(total)
return (title, len(response.text), parsed_links)
def subsearch(start, end, result, tree, midpoint_links = set()):
end = end.lower().split("/wiki/")[1]
end_words = set(end.split("_"))
checked = set()
unsearched = [[("", start)]]
nodes = {}
threads = []
def done(urls):
result.append(urls)
def score_page(title, length, links):
scores = [
(1, length / 1e4),
(10, 2*len(links)),
(100, 100 if len(set(title.split("_")) & end_words) > 0 else 0),
]
return sum([weight * score for weight, score in scores])
def score_link(urls):
url = urls[-1][1]
title = url.split("/wiki/")[1]
scores = [
(3, 100 - (0.01 * (len(title) ** 2))),
(1, 100 if len(title) > 8 and title[:8] == "list_of_" else 0),
(100, 100 if len(set(title.split("_")) & end_words) > 0 else 0),
]
return sum([weight * score for weight, score in scores])
def fetch(urls, url, result, midpoint_links):
ret = tree(urls, url, done, checked, end, result, midpoint_links)
if not ret:
return
title, length, parsed_links = ret
parsed_links.sort(key=score_link, reverse=True)
if len(parsed_links) > max_links:
parsed_links = parsed_links[:max_links]
nodes[url] = (score_page(title, length, parsed_links), parsed_links)
print(f"SEARCHED {title}")
while not result:
if len(unsearched) == 0:
for thr in threads:
if thr.is_alive():
thr.join()
if result:
break
best_node = max(nodes, key=lambda k: nodes[k][0])
print(f"\nSELECTED NEXT BEST NODE: {best_node}\n")
unsearched = nodes[best_node][1]
nodes = {}
urls = unsearched.pop(0)
url = urls[-1][1]
if url in checked: continue
checked.add(url)
try:
thr = Thread(target=fetch, daemon=True, args=(urls, url, result, midpoint_links))
thr.start()
threads.append(thr)
except RuntimeError:
pass
for thr in threads:
if thr.is_alive():
thr.join()
print(f"\nCOMPLETED SUBSEARCH FOR MIDPOINT PATH\n")
print()
start_time = time()
_, __, midpoint_links = forward_tree([], midpoint, (lambda x:None), set(), "", [], set())
midpoint_links = {(x[0][0], x[0][1].split("/wiki/")[1].lower()) for x in midpoint_links}
result1, result2 = [], []
subsearch(url_input(start), midpoint, result1, forward_tree)
subsearch(url_input(end), midpoint, result2, backward_tree, midpoint_links)
result = result1[0][:-1] + result2[0][::-1]
end_time = time()
elapsed_time = round((end_time - start_time) * 100.0) / 100.0
def print_result(file = stdout):
print(*[f"{r[0]}\n >> {r[1]}" for r in result], sep="\n", file=file)
with open("result.txt", "w+") as file:
print_result(file = file)
print(f"RESULT in {elapsed_time}s:\n")
print_result()
print("\nSAVED TO \"result.txt\"\n")
#input("Press ENTER to close...")
###########################################
if len(argv) > 2:
start, end = argv[1], argv[2]
else:
start, end = input("Paste the start URL: "), input("Paste the end URL: ")
search(start, end)