scrap_irasutoya.py
from bs4 import BeautifulSoup as bs
import requests
import json
import re
from tqdm import tqdm

# Base URL of the site to scrape
url_de_base = "https://www.irasutoya.com/"


def soup_creation(url):
    """
    Returns the BeautifulSoup analysis of an HTML page (its soup)
    Args:
        url (str): Link to the page to be scraped
    Returns:
        soup : Soup of the scraped page
    """
    # Download the page
    response = requests.get(url)
    # Get the HTML of the downloaded response
    html = response.content
    # Parse the HTML with the "lxml" parser
    return bs(html, "lxml")
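
# Minimal usage sketch of the helper above (main() below does the same with the base URL):
#   soup = soup_creation(url_de_base)
#   print(soup.title.string)  # the page's <title> text, assuming the request succeeded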


def get_main_page_all_links(soup):
    """
    Analyzes the main page of the site and retrieves all available theme links
    Args:
        soup (BeautifulSoup): Soup of the scraped page
    Returns:
        list : List of all scraped links on the page
    """
    links = soup.find_all("div", id="section_banner")
    lst_of_links = []
    for link in links:
        for link_of_link in link.find_all('a'):
            lst_of_links.append(link_of_link.get('href'))
    return lst_of_links


def get_sub_page_all_links(soup):
    """
    Analyzes the sub page of the site and retrieves all available sub-theme links
    Args:
        soup (BeautifulSoup): Soup of the scraped page
    Returns:
        list : List of all scraped links on the sub-page
    """
    links = soup.find_all("div", id="banners")
    lst_of_links = []
    for link in links:
        for link_of_link in link.find_all('a'):
            lst_of_links.append(link_of_link.get('href'))
    return lst_of_links
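
# Hedged note on the two banner helpers above: they assume the listing pages wrap their
# category links in <div id="section_banner"> / <div id="banners"> blocks, roughly like
#   <div id="section_banner"><a href="/p/animal.html">...</a><a href="/p/food.html">...</a></div>
# so that find_all('a') followed by .get('href') yields the raw href values. The markup and
# hrefs shown here are illustrative only, not a verified excerpt of the site.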


def next_page(soup):
    """
    Function which allows to get the link to the next page if it exists
    Args:
        soup (BeautifulSoup): Soup of the scraped page
    Returns:
        str or None : String of the link to the next page if it exists
    """
    try:
        link_next_page = soup.find('div', id='page_link').find_all("a")[-2].get('href')
        return link_next_page
    except (AttributeError, IndexError):
        # Either the pagination block is missing or it has no "next" link
        return None
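
# Hedged note on the selector above: the listing pages are assumed to expose their pagination
# inside <div id="page_link">, with the second-to-last <a> pointing at the next page. If that
# block is missing (AttributeError) or too short (IndexError), None is returned and the
# recursion in scrap_page() simply stops.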


def recup_data(soup, file_name):
    """
    Collecting useful data and creating a dictionary to handle them
    Args:
        soup (BeautifulSoup): Soup of the scraped page
        file_name (str): Path of the JSON file the scraped data is appended to
    Returns:
        None : the scraped data is written to file_name
    """
    all_data = soup.find_all('div', class_='boxim')
    for data in tqdm(all_data, desc="Extracting data"):
        script_content = data.find('a').script
        # Using regular expressions to extract the image link and its description
        match = re.search(r'bp_thumbnail_resize\("(.*?)","(.*?)"\)', script_content.string)
        if match:
            image_link = match.group(1)
            image_text = match.group(2).split('&')[0].split('のイラスト')[0]
            name_key = image_link.split('/')[-1].split('.')[0]
            if image_link and image_text:
                dic = {
                    image_text: {
                        'img': image_link,
                        'description': image_text
                    }
                }
                append_to_json(dic, file_name)
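
# Illustration (with made-up values) of what the regular expression above is expected to pull
# out of the inline <script> on a thumbnail block:
#   re.search(r'bp_thumbnail_resize\("(.*?)","(.*?)"\)',
#             'bp_thumbnail_resize("https://example.com/neko.png","猫のイラスト")')
# would give group(1) == "https://example.com/neko.png" and group(2) == "猫のイラスト",
# which the splits above reduce to the bare description "猫".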


def append_to_json(data_to_append, json_file_path):
    """
    Appends data to an existing JSON file.
    Args:
    - data_to_append (dict): The data to add to the JSON file.
    - json_file_path (str): The path to the existing JSON file.
    """
    # Load the current contents if the file already holds valid JSON, otherwise start empty
    try:
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            existing_data = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        existing_data = {}
    existing_data.update(data_to_append)
    # Write the updated data to the JSON file
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(existing_data, json_file, indent=4, ensure_ascii=False)
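
# Sketch (hypothetical entries) of the JSON file this helper builds up after a few calls:
# {
#     "猫": {
#         "img": "https://example.com/neko.png",
#         "description": "猫"
#     },
#     "犬": {
#         "img": "https://example.com/inu.png",
#         "description": "犬"
#     }
# }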


def scrap_page(url, file_name):
    """
    This function scrapes the given URL and saves the data in a JSON file.
    Parameters:
        url (str): The URL of the page to scrape.
        file_name (str): The name of the JSON file to save the data in.
    Returns:
        None
    """
    # Create soup for the current page
    actual_page = soup_creation(url)
    # Scrape the current page
    recup_data(actual_page, file_name)
    # Get the next page to analyze if it exists
    next_page_url = next_page(actual_page)
    # Recursion of the function if the next page exists
    if next_page_url is not None:
        scrap_page(next_page_url, file_name)


def main(url_de_base, file_name):
    '''
    Collects all theme links from the main page, then retrieves the images and descriptions
    from every sub-theme page, following the pagination of each one before moving on to the next
    Args:
        url_de_base (str): Base URL of the site to scrape
        file_name (str): Path of the JSON file the scraped data is written to
    '''
    # Create soup for the main page
    main_page = soup_creation(url_de_base)
    # Retrieve all desired links from the main page
    links_theme = get_main_page_all_links(main_page)
    for part_of_link in links_theme:
        if part_of_link.startswith("/p/"):
            try:
                # Create soup for the theme page
                page_theme = soup_creation(url_de_base + part_of_link)
                links_sub_theme = get_sub_page_all_links(page_theme)
                for sub_link in links_sub_theme:
                    # Scrape the sub-theme page and its following pages
                    scrap_page(sub_link, file_name)
            except Exception:
                # Skip a theme page that cannot be downloaded or parsed and move on
                continue


if __name__ == '__main__':
    main(url_de_base, 'data_collection/irasutoya_kana.json')
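
# Usage sketch: running `python scrap_irasutoya.py` crawls the theme and sub-theme listings and
# writes the collected image links and descriptions into data_collection/irasutoya_kana.json
# (the data_collection/ directory is assumed to exist beforehand).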