# We scrape the images
# Scrape product thumbnails from a laTostadora storefront page and write
# a markdown file that links each product image to its product page.

# https://www.latostadora.com/afaces

import requests
from bs4 import BeautifulSoup

# Storefront pages to scrape (only the first entry is used for now).
playlists = ["https://www.latostadora.com/afaces"]
playlistName = ""

url = playlists[0]
res = requests.get(url)
html_page = res.content

soup = BeautifulSoup(html_page, 'html.parser')

# Parent tags whose <img> children are boilerplate, not product content.
blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
]

# Product thumbnails carry the class "zoomable-images" and keep their
# real URL in the lazy-load attribute "data-original".  Read the
# attribute directly instead of splitting the serialized tag text on
# 'data-original='/'zoomable-images'/spaces, which broke whenever the
# attribute order or whitespace inside the tag changed.
images_url = []
for img in soup.find_all('img'):
    if img.parent.name in blacklist:
        continue
    # BeautifulSoup returns the class attribute as a list of tokens.
    if 'zoomable-images' in img.get('class', []) and img.get('data-original'):
        images_url.append(img['data-original'])

# Absolute URL of each product page (one card per product).
project_href = [
    "https://www.latostadora.com" + card.a['href']
    for card in soup.find_all('div', attrs={'class': 'm-product-card'})
    if card.a is not None  # skip malformed cards instead of crashing
]

# Pair every image with its product page.  zip stops at the shorter
# list, so a count mismatch no longer raises IndexError.
with open("store.md", "w") as o:
    for image, href in zip(images_url, project_href):
        # Markdown: clickable product image -> product page.  The old
        # code paired the image URL with the href but never wrote it,
        # emitting an empty link "[](href)".
        o.write("[![](" + image + ")](" + href + ")\n")