-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlinkExtractor.py
38 lines (34 loc) · 1.44 KB
/
linkExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
import os
# Collects every http(s) link found in the files below each immediate
# sub-directory ("section") of a "sample-onboarding-resources" directory and
# writes them, one per line, to a Links.txt file inside that section.

# Directory whose immediate sub-directories each receive a Links.txt file.
RESOURCES_DIR = "sample-onboarding-resources"
# Name of the file written into every section directory.
LINKS_FILENAME = "Links.txt"
# Files whose lower-cased name contains one of these fragments are not scanned
# (this also keeps a previously generated Links.txt out of its own input).
SKIPPED_NAME_PARTS = ("links", "change")
# A matched link is cut at the first occurrence of any of these characters.
unsafe = [")", "<", ">", "'", "\"", "{", "}", "|", "\\", "^", "~", "[", "]", "`"]
# Raw string: "\s" inside a normal string literal is an invalid escape
# sequence. Compiled once because it is applied to every line of every file.
URL_PATTERN = re.compile(r"https?://\S+")


def clean_link(link):
    """Return *link* truncated at the first character listed in ``unsafe``.

    Args:
        link: Raw text matched by ``URL_PATTERN``.

    Returns:
        The leading part of *link* that contains no unsafe character.
    """
    for ch in unsafe:
        link = link.partition(ch)[0]
    return link


def extract_links(line):
    """Return the cleaned http(s) links found in *line*, in order of appearance.

    Args:
        line: One line of text.

    Returns:
        A list of strings; empty when the line holds no link.
    """
    return [clean_link(match) for match in URL_PATTERN.findall(line)]


def is_scannable(filename):
    """Return True unless *filename* contains "links" or "change" (any case)."""
    lowered = filename.lower()
    return not any(part in lowered for part in SKIPPED_NAME_PARTS)


def write_section_links(section_dir):
    """Write the links found below *section_dir* to ``section_dir/Links.txt``.

    Every scannable file in *section_dir* and its sub-directories is read line
    by line (undecodable bytes are ignored) and each link is written on its
    own line. The output file is created even when no link is found.

    Args:
        section_dir: Path of the directory to scan.

    Raises:
        OSError: If the output file or one of the scanned files cannot be
            opened.
    """
    print("Processing", section_dir.lower())
    links_path = os.path.join(section_dir, LINKS_FILENAME)
    # The context manager guarantees the output is flushed and closed before
    # the next section is processed, even if reading a file fails.
    with open(links_path, 'w', encoding='utf-8', errors="ignore") as out:
        for root, _dirs, files in os.walk(section_dir):
            for name in files:
                if not is_scannable(name):
                    continue
                source_path = os.path.join(root, name)
                with open(source_path, encoding='utf-8', errors="ignore") as src:
                    for line in src:
                        for link in extract_links(line):
                            out.write(link + "\n")


def main(start_dir=None):
    """Create a Links.txt in every section of every resources directory.

    Args:
        start_dir: Directory tree to search; defaults to the current working
            directory.
    """
    if start_dir is None:
        start_dir = os.getcwd()
    for root, dirs, _files in os.walk(start_dir):
        # Compared in lower case so differently-cased directory names are
        # handled as well.
        if os.path.basename(root).lower() != RESOURCES_DIR:
            continue
        for name in dirs:
            section = os.path.join(root, name)
            # os.walk lists symlinked directories but does not descend into
            # them; skip them here too so links are never followed.
            if os.path.islink(section):
                continue
            write_section_links(section)


if __name__ == "__main__":
    main()