Skip to content

Commit 160f960

Browse files
committed
complete manning book scrap
1 parent 89e82d1 commit 160f960

10 files changed

Lines changed: 430 additions & 62 deletions

File tree

backend/booksbot/booksbot/pipelines.py

Lines changed: 68 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,86 @@
55

66

77
# useful for handling different item types with a single interface
8-
from itemadapter import ItemAdapter
8+
9+
from datetime import datetime
10+
11+
from booksbot.utils import extract_authors
12+
from sqlalchemy import select
13+
14+
from python_talk.app import create_app
15+
from python_talk.extensions import db
16+
from python_talk.models.book import Book, Author
917

1018

1119
# class BooksbotPipeline:
1220
# def process_item(self, item, spider):
1321
# return item
14-
from python_talk.models.book import Book
15-
from python_talk.extensions import db
22+
1623

1724
class BookPipeline:
    """Scrapy item pipeline that stores scraped books and their authors.

    A Flask application is created when the spider opens so that the
    Flask-SQLAlchemy session (``db.session``) can be used inside an
    application context for every item.
    """

    def process_item(self, item, spider):
        """Persist one scraped book together with its authors.

        Authors are looked up by name and created when missing; the book is
        inserted only if no row with the same ISBN exists yet.

        Args:
            item: Mapping produced by the spider. Reads the keys ``isbn``,
                ``authorshipDisplay``, ``title``, ``price``, ``link``,
                ``publisher`` and ``date``.
            spider: The spider that produced the item (unused).

        Returns:
            The item, unchanged, so that later pipelines still receive it.
        """
        isbn = item.get("isbn")
        if not isbn:
            # Without an ISBN the duplicate lookup below cannot identify the
            # book. NOTE(review): book.isbn looks to be NOT NULL after
            # migration 5d1bca12fab8, so inserting would fail - confirm.
            import logging  # local import: keeps the file's import block untouched

            logging.getLogger(__name__).warning(
                "Skipping book without ISBN: %s", item.get("title")
            )
            return item

        with self.app.app_context():
            authors = self._get_or_create_authors(item.get('authorshipDisplay'))

            stmt = select(Book).where(Book.isbn == isbn)
            existing_book = db.session.execute(stmt).scalar_one_or_none()
            if not existing_book:
                book_data = {
                    'title': item['title'],
                    'isbn': isbn,
                    'price': item['price'],
                    'url': item['link'],
                    'publisher': item['publisher'],
                    'publication_date': datetime.strptime(
                        item['date'], "%Y-%m-%dT%H:%M:%S%z"
                    ).date(),
                }
                book = Book(**book_data)
                # Link every parsed author, not only the last one from the loop.
                book.authors = authors
                db.session.add(book)

            # Commit unconditionally so newly created authors are persisted
            # even when the book itself already existed.
            db.session.commit()

        return item

    def _get_or_create_authors(self, authorship_display):
        """Return Author rows for every name in *authorship_display*.

        Existing authors are matched by exact name (two different people with
        the same name are not told apart); missing ones are added and flushed.
        Must be called inside an application context.
        """
        names = extract_authors(authorship_display) if authorship_display else []

        authors = []
        for name in names:
            stmt = select(Author).where(Author.name == name)
            author = db.session.execute(stmt).scalar_one_or_none()

            if not author:
                author = Author(name=name)
                db.session.add(author)
                db.session.flush()

            authors.append(author)
        return authors

    def open_spider(self, spider):
        """Create the Flask app whose context is used for database access."""
        self.app = create_app()

    def close_spider(self, spider):
        """Release the SQLAlchemy session when the spider finishes."""
        with self.app.app_context():
            db.session.remove()
75+
76+
# if __name__ == '__main__':
77+
# app = create_app()
78+
# with app.app_context():
79+
# item = {
80+
# "title": "Generative AI in Action",
81+
# "author": "Alessandro Negro with Vlastimil Kus, Giuseppe Futia and Fabio Montagna<br><i>Forewords by Maxime Labonne, Khalifeh AlJadda<\u002fi>",
82+
# "price": 47.99,
83+
# "url": "https://www.manning.com/books/generative-ai-in-action",
84+
# "description": None,
85+
# "publisher": "manning",
86+
# 'isbn': '9781617291326',
87+
# 'date': "2020-11-29T00:00:00-0500",
88+
# }
89+
# pipeline = BookPipeline()
90+
# pipeline.process_item(item, spider=None)

backend/booksbot/booksbot/settings.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@
2222
ROBOTSTXT_OBEY = True
2323

2424
# Concurrency and throttling settings
25-
#CONCURRENT_REQUESTS = 16
26-
CONCURRENT_REQUESTS_PER_DOMAIN = 1
27-
DOWNLOAD_DELAY = 1
25+
CONCURRENT_REQUESTS = 16
26+
CONCURRENT_REQUESTS_PER_DOMAIN = 16
27+
DOWNLOAD_DELAY = 0.25
2828

2929
# Disable cookies (enabled by default)
3030
#COOKIES_ENABLED = False
Lines changed: 76 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,100 @@
11
import json
2+
import re
23

34
import scrapy
4-
from twisted.web.xmlrpc import payloadTemplate
55

66

77
class ManningSpider(scrapy.Spider):
    """Collect Python books from Manning's catalog search API.

    The catalog endpoint is queried with a JSON POST request; every product
    in the response is followed to its detail page, where the ISBN is read
    before the product dict is yielded as the item.
    """

    name = "manning"
    allowed_domains = ['www.manning.com']

    # Catalog search endpoint, requested once per result page.
    _CATALOG_URL = 'https://www.manning.com/search/getCatalogData'

    # Matches e.g. "ISBN 9781617291326" or "ISBN-13: 978-1-61729-132-6".
    _ISBN_RE = re.compile(r'ISBN(?:-13)?:?\s*([0-9\-]{10,17})')

    # Class attribute because several methods (start_requests(), parse())
    # send the same headers.
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Origin': 'https://www.manning.com',
        'Referer': 'https://www.manning.com/',
    }

    payload = {
        "accessType": [],
        # keywords: keywords contained in the book title
        "keywords": [],
        "level": [],
        "meapFilter": "published",
        "productType": [
            "book"
        ],
        "programmingLanguages": [
            "python"
        ],
        "selectedCategoryIds": [
            1
        ],
        "sort": "newest",
        "includePrices": True
    }

    def _catalog_request(self, payload):
        """Build the POST request for one catalog page described by *payload*."""
        return scrapy.Request(
            url=self._CATALOG_URL,
            method='POST',
            headers=self.headers,
            body=json.dumps(payload),
            callback=self.parse,
        )

    def start_requests(self):
        """Override the default GET start request with a catalog POST request."""
        yield self._catalog_request(self.payload)

    def parse_detail(self, response):
        """Read the ISBN from a book detail page and yield the finished item.

        The product dict travels in ``response.meta['book']``. Its ``isbn``
        key is set to the digits found on the page, or to ``None`` when the
        page shows no ISBN and the dict carried no truthy value already.
        """
        import time
        self.logger.info(f"parse_detail start {time.time()}")
        book = response.meta['book']

        # ISBN text from the product meta list in the HTML page.
        isbn_text = response.xpath(
            '//div[contains(@class,"product-meta")]'
            '//li[starts-with(normalize-space(),"ISBN")]/text()'
        ).get()
        # .get() returns None when nothing matches; searching None would
        # raise TypeError, so only search when text was found.
        m = self._ISBN_RE.search(isbn_text) if isbn_text else None
        if m:
            book['isbn'] = m.group(1).replace('-', '')

        # Normalise a missing or empty ISBN to None.
        book['isbn'] = book.get('isbn') or None

        yield book

    def parse(self, response):
        """Handle one catalog page: follow every book, then request the next page.

        ``response`` is the reply to a catalog request created by
        ``start_requests()`` or by a previous call of this method.
        """
        data = response.json()
        for book in data['products']:
            book['publisher'] = 'manning'
            yield scrapy.Request(
                # urljoin leaves absolute links untouched and resolves
                # relative ones against the current response URL.
                url=response.urljoin(book['link']),
                callback=self.parse_detail,
                meta={'book': book},
            )

        # Request the following page while the API reports one.
        pagination = data.get('pagination')
        if pagination and pagination.get('hasNextPage'):
            # A shallow copy is enough: only the top-level 'page' key is set.
            payload = self.payload.copy()
            payload['page'] = pagination['page'] + 1
            yield self._catalog_request(payload)

backend/booksbot/booksbot/utils.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
Manning 出版社作者格式类型:
3+
1.单个作者:"Taehun Kim"
4+
2.使用逗号分隔的多个作者:"Benjamin Tan Wei Hao, Shanoop Padmanabhan, and Varun Mallya"
5+
3.包含 HTML 标签的作者:"Constantin Gonciulea and Charlee Stefanski<br><i>Foreword by Heather Higgins<\u002fi>"
6+
4.使用 ...with...and...连接的作者:"Alessandro Negro with Vlastimil Kus, Giuseppe Futia and Fabio Montagna"
7+
5.包含附加信息的作者: "Edward Raff, Drew Farris and Stella Biderman for Booz Allen Hamilton"
8+
9+
"""
10+
import re
11+
from html import unescape
12+
13+
14+
def extract_authors(authorship_display: str):
    """Split a Manning ``authorshipDisplay`` string into author names.

    HTML entities are decoded, HTML tags are dropped, a trailing
    "Foreword(s) by ..." credit and a trailing " for <organisation>" suffix
    are removed, and the remainder is split on commas, " and " and " with ".

    Args:
        authorship_display: Raw author text, e.g.
            ``"A with B, C and D<br><i>Foreword by E</i>"``. ``None`` and
            the empty string are accepted.

    Returns:
        List of author names in their original order, without surrounding
        whitespace; empty when there is nothing to parse.
    """
    if not authorship_display:
        return []

    # Decode HTML entities such as "&amp;".
    s = unescape(authorship_display)

    # Replace tags with a space rather than nothing, so that words on either
    # side of a tag (e.g. "Smith<br>with Jane") are not glued together.
    s = re.sub(r'<[^>]+>', ' ', s)

    # Drop the foreword credit; everything after it is not an author.
    s = re.sub(r'Forewords?\s+by.*$', '', s, flags=re.IGNORECASE | re.DOTALL)

    # Drop a trailing organisation ("... for Booz Allen Hamilton").
    s = re.sub(r'\s+for\s+.*$', '', s, flags=re.IGNORECASE | re.DOTALL)

    # Commas and the words "and" / "with" all separate names; ", and" yields
    # an empty piece that is filtered out below.
    pieces = re.split(r',|\s+(?:and|with)\s+', s)

    return [piece.strip() for piece in pieces if piece.strip()]
35+
36+
37+
# if __name__ == '__main__':
38+
# data = [
39+
# "Reuven M. Lerner",
40+
# "Taehun Kim",
41+
# "Val Andrei Fajardo",
42+
# "Benjamin Tan Wei Hao, Shanoop Padmanabhan, and Varun Mallya",
43+
# "Luis G. Serrano",
44+
# "Noah Flynn",
45+
# "Aneev Kochakadan",
46+
# "Tomasz Lelek and Artur Skowroński",
47+
# "Roberto Infante",
48+
# "Sebastian Raschka",
49+
# "Luca Antiga, Eli Stevens, Howard Huang, Thomas Viehmann",
50+
# "Nicole Koenigstein<br><i>Foreword by Luis Serrano</i>",
51+
# "Wei-Meng Lee",
52+
# "Pekka Enberg",
53+
# "José Haro Peralta<br><i>Foreword by Dan Barahona</i>",
54+
# "Alessandro Negro with Vlastimil Kus, Giuseppe Futia and Fabio Montagna<br><i>Forewords by Maxime Labonne, Khalifeh AlJadda</i>",
55+
# "Will Kurt",
56+
# "Rush Shahani",
57+
# "Jungjun Hur and Younghee Song",
58+
# "Mariia Mykhailova",
59+
# "François Chollet and Matthew Watson",
60+
# "Justin Mitchel",
61+
# "Tyler Suard",
62+
# "Tomaž Bratanič and Oskar Hane<br><i>Foreword by Paco Nathan</i>",
63+
# "Ashish Ranjan Jha",
64+
# "Sebastian Raschka and Abhinav Kimothi",
65+
# "Edward Raff, Drew Farris and Stella Biderman for Booz Allen Hamilton",
66+
# "Emmanuel Maggiori",
67+
# "Abhinav Kimothi",
68+
# "Gianluigi Mucciolo",
69+
# "Gianluigi Mucciolo",
70+
# "Gianluigi Mucciolo",
71+
# "Gianluigi Mucciolo",
72+
# "Gianluigi Mucciolo",
73+
# "Vaibhav Verdhan<br><i>Foreword by Ravi Gopalakrishnan</i>",
74+
# "Constantin Gonciulea and Charlee Stefanski<br><i>Foreword by Heather Higgins</i>",
75+
# "Immanuel Trummer",
76+
# "Christopher Kardell and Mark Brouwer",
77+
# "Rob Reider and Alexander Michalka",
78+
# "Mona Khalil<br><i>Foreword by Barry McCardel</i>"
79+
# ]
80+
# for name in data:
81+
# result = extract_authors(name)
82+
# print(result)
83+
# print(unescape("Constantin Gonciulea and Charlee Stefanski<br><i>Foreword by Heather Higgins<\u002fi>"))

backend/booksbot/main.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@
33

44
if __name__ == '__main__':
55
print(1)
6-
print(2)
76
execute(['scrapy', 'crawl', 'manning'])
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""empty message
2+
3+
Revision ID: 5d1bca12fab8
4+
Revises: 8aa8c270e402
5+
Create Date: 2026-01-01 13:48:56.078170
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
from sqlalchemy.dialects import mysql
11+
12+
# revision identifiers, used by Alembic.
13+
revision = '5d1bca12fab8'
14+
down_revision = '8aa8c270e402'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
    """Backfill missing ISBNs, then make ``book.isbn`` non-nullable."""
    # ISBN used to be nullable because it could not always be found; now that
    # it can, rows still lacking one get a 'TEMP_<id>' placeholder so the
    # NOT NULL change below can be applied.
    backfill_missing_isbn = '''
        UPDATE book
        SET isbn = CONCAT('TEMP_', id)
        WHERE isbn IS NULL
    '''
    op.execute(backfill_missing_isbn)

    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('book', schema=None) as book_table:
        book_table.alter_column(
            'isbn',
            existing_type=mysql.VARCHAR(length=128),
            nullable=False,
        )

    # ### end Alembic commands ###
34+
35+
36+
def downgrade():
    """Allow NULL in ``book.isbn`` again.

    Only the column constraint is changed here; existing values, including
    any 'TEMP_<id>' placeholders written by ``upgrade()``, are left as is.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('book', schema=None) as book_table:
        book_table.alter_column(
            'isbn',
            existing_type=mysql.VARCHAR(length=128),
            nullable=True,
        )

    # ### end Alembic commands ###

0 commit comments

Comments
 (0)