-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlambda_function.py
More file actions
108 lines (87 loc) · 3.74 KB
/
lambda_function.py
File metadata and controls
108 lines (87 loc) · 3.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import csv
import os
from datetime import datetime
from io import StringIO
from urllib.parse import urljoin

import boto3
import requests
from bs4 import BeautifulSoup
def get_book_category(book_url, timeout=10):
    """Return the category shown in the breadcrumb of a book detail page.

    Fetches ``book_url`` and reads the text of the third link inside the
    ``ul.breadcrumb`` element, stripped of surrounding whitespace.

    Args:
        book_url: URL of the book detail page to fetch.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The category text, or ``"general"`` when the page cannot be fetched
        or parsed, has no breadcrumb, or the breadcrumb has fewer than three
        links.
    """
    try:
        response = requests.get(book_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        breadcrumb = soup.find('ul', class_='breadcrumb')
        links = breadcrumb.find_all('a') if breadcrumb else []
        if len(links) >= 3:
            return links[2].text.strip()
    except Exception as exc:
        # Best-effort lookup: any failure falls back to the default category.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate, and the reason is logged instead of hidden.
        print(f"Could not determine category for {book_url}: {exc}")
    return "general"
# Listing pages of the catalogue; the page number is substituted for ``{}``.
_CATALOGUE_PAGE_URL = "https://books.toscrape.com/catalogue/page-{}.html"
# NOTE(review): the site paginates its catalogue as 50 pages; confirm if the
# site layout ever changes.
_LAST_PAGE = 50
# Seconds to wait for each listing-page response.
_REQUEST_TIMEOUT = 10
# Maps the word in the ``star-rating`` CSS class to its numeric value.
_RATING_MAP = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
_CSV_FIELDS = ['id', 'title', 'price', 'rating', 'availability', 'category', 'image']


def _parse_price(price_text):
    """Convert a price label such as ``'£51.77'`` to a float (0.0 if unparseable)."""
    numeric = ''.join(ch for ch in price_text if ch.isdigit() or ch == '.')
    try:
        return float(numeric)
    except ValueError:
        return 0.0


def _parse_book(container, page_url):
    """Extract the listing fields of one ``article.product_pod`` element.

    Args:
        container: The parsed ``article`` tag of a single book.
        page_url: URL of the listing page the tag came from; relative links
            inside the tag are resolved against it.

    Returns:
        A ``(book_url, fields)`` tuple where ``fields`` holds ``title``,
        ``price``, ``rating``, ``availability`` and ``image``; or ``None``
        when the tag has no title link to follow.
    """
    heading = container.find('h3')
    title_element = heading.find('a') if heading else None
    if title_element is None or not title_element.get('href'):
        return None

    price_element = container.find('p', class_='price_color')
    price_text = price_element.text.strip() if price_element else '£0.00'

    rating_element = container.find('p', class_='star-rating')
    rating_classes = rating_element.get('class', []) if rating_element else []
    # The rating word is the second CSS class, e.g. ``star-rating Three``.
    rating_text = rating_classes[1] if len(rating_classes) > 1 else None

    availability_element = container.find('p', class_='instock availability')
    availability = availability_element.text.strip() if availability_element else 'Unknown'

    images = container.select('div.image_container a img')
    img_src = images[0].get('src') if images else None
    # Resolve against the page URL so ``../media/...`` becomes an absolute
    # URL; an image-less book gets an empty string.
    image = urljoin(page_url, img_src) if img_src else ''

    fields = {
        "title": title_element.get('title'),
        "price": _parse_price(price_text),
        "rating": _RATING_MAP.get(rating_text, 0),
        "availability": availability,
        "image": image,
    }
    return urljoin(page_url, title_element.get('href')), fields


def lambda_handler(event, context):
    """Scrape the book catalogue and upload the result to S3 as a CSV file.

    Walks every catalogue listing page, collects one record per book
    (looking up each book's category via ``get_book_category``), writes the
    records to an in-memory CSV and stores it with ``put_object`` in the
    bucket named by the ``BUCKET_NAME`` environment variable (default
    ``scrape-output``).

    Args:
        event: Lambda invocation event; not used.
        context: Lambda runtime context; not used.

    Returns:
        A dict with ``statusCode`` 200 and a ``body`` message stating how
        many books were scraped and where the CSV was saved.

    Listing pages that fail to download are logged and skipped. Errors from
    the S3 upload are not caught here and propagate to the caller.
    """
    books = []
    # Inclusive upper bound so the final listing page is scraped too.
    for page in range(1, _LAST_PAGE + 1):
        print(f"Scraping page {page}...")
        page_url = _CATALOGUE_PAGE_URL.format(page)
        try:
            response = requests.get(page_url, timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error scraping page {page}: {e}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        for container in soup.find_all('article', class_='product_pod'):
            parsed = _parse_book(container, page_url)
            if parsed is None:
                # Malformed listing entry: skip it rather than abort the run.
                continue
            book_url, fields = parsed
            books.append({
                "id": len(books) + 1,
                "title": fields["title"],
                "price": fields["price"],
                "rating": fields["rating"],
                "availability": fields["availability"],
                "category": get_book_category(book_url),
                "image": fields["image"],
            })

    # Convert to CSV
    csv_buffer = StringIO()
    writer = csv.DictWriter(csv_buffer, fieldnames=_CSV_FIELDS)
    writer.writeheader()
    writer.writerows(books)

    # Upload to S3
    s3 = boto3.client('s3')
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    key = f"books_general_{timestamp}.csv"
    bucket_name = os.environ.get('BUCKET_NAME', 'scrape-output')
    s3.put_object(
        Bucket=bucket_name,
        Key=key,
        # Encode explicitly so non-ASCII titles are stored as UTF-8 bytes.
        Body=csv_buffer.getvalue().encode('utf-8'),
        ContentType='text/csv'
    )

    return {
        'statusCode': 200,
        'body': f'Scraped {len(books)} books and saved to s3://{bucket_name}/{key}'
    }