-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_me_workshop.py
53 lines (40 loc) · 2.32 KB
/
scrape_me_workshop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
## This bit of code extracts text from the "workshop" section of Instructables.
## It is still very rough, but it serves the purpose. I will update it if I need new data in the future.
## So take this tool with a grain of salt for the moment...
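# This creates all the offsets (i.e. the identifiers of the URLs that we want) for the workshop section of Instructables.
#
# At present there are 59 pages in the section.
# NOTE(review): 59 is used below as the first offset and as the step between offsets, so it looks
# like an items-per-page count rather than a page count -- confirm.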
# First offset value, and the (initially empty) list that will collect
# every offset to request.
and_counting, instructs_urls_offsets = 59, []
# The number 33689 can be found in the URL of the last page of the workshop section.
# NB: the more pages there are, the bigger this number gets: check it from time to time.
# Yes, the best way to do this is to create a bot that checks the number every week and updates the db. I'll do it tomorrow :-)
from bs4 import BeautifulSoup
import requests
# Collect every offset from the current value of and_counting up to and
# including 33689, in steps of 59.
# 33689 is the offset of the LAST listing page and is itself a multiple of 59
# (59 * 571), so the comparison must be inclusive: with a strict "<" the last
# page was never added to the list.
while and_counting <= 33689:
    instructs_urls_offsets.append(and_counting)
    and_counting += 59
# Build the list of listing-page URLs: one URL per collected offset, followed
# by the section URL without any offset parameter.
a = "http://www.instructables.com/explore/category/workshop/?offset="
all_the_urls = [a + str(offset) for offset in instructs_urls_offsets] + [
    "http://www.instructables.com/explore/category/workshop/"
]
# This extracts the links from all the pages listed in the a section.
# The links are not complete URLs, but by prepending the string "http://www.instructables.com"
# we get exactly what we need.
def extract_links_from_urls(list_of_links):
    """Collect the "/id/" links found on each of the given pages.

    Args:
        list_of_links: iterable of page URLs to download and parse.

    Returns:
        A list with one entry per input URL, in input order. Each entry is a
        de-duplicated list (in arbitrary order) of the ``href`` values of the
        ``<a>`` tags whose markup contains ``"/id/"``. An empty input yields
        an empty list and performs no request.
    """
    links_bucket = []
    for page_url in list_of_links:
        raw_soup = BeautifulSoup(requests.get(page_url).text, "lxml")
        # href=True restricts the search to anchors that carry an href
        # attribute. Without it, an <a> lacking href whose markup still
        # contained "/id/" made tag["href"] raise KeyError and abort the run.
        page_links = {
            str(tag["href"])
            for tag in raw_soup.find_all("a", href=True)
            if "/id/" in str(tag)
        }
        links_bucket.append(list(page_links))
        # The bucket holds one list per page, so despite the wording of the
        # message this prints the number of pages processed so far.
        # NOTE(review): assumed to be per-page progress output -- confirm it
        # belongs inside the loop.
        print("number of links in the bucket: ", len(links_bucket))
    return links_bucket
# This is going to take some time: one HTTP request is made for every URL in
# all_the_urls. The result holds one list of links per requested page.
all_the_links = extract_links_from_urls(all_the_urls)
# This processes the raw HTML and gets rid of unwanted http(s) links.
# It doesn't take care of bare "www" links.
def get_the_juice(url):
    """Download one page of the site and return the text of its paragraphs.

    Args:
        url: site-relative path; it is appended to
            "http://www.instructables.com" to build the request URL.

    Returns:
        A list with one string per ``<p>`` element of the page, with every
        match of the http(s) pattern below removed from the text.
    """
    # `re` is imported here because the module-level imports visible next to
    # this function do not provide it; without it the call raised NameError.
    import re

    # A plain requests.get() replaces the throwaway requests.Session() that
    # was created on every call and never closed.
    req = requests.get("http://www.instructables.com" + url)
    soup = BeautifulSoup(req.content, "lxml")
    # The substitution also removes URLs that are used as the visible text of
    # <a> elements.
    # NOTE(review): ".*" is greedy, so everything from the URL up to the end
    # of its line is removed, not only the URL itself -- confirm this is the
    # intended behaviour.
    return [
        re.sub(r'https?:\/\/.*[\n\r\s]?', '', item.get_text())
        for item in soup.select("p")
    ]