codists
diff --git a/‎backend/booksbot/booksbot/pipelines.py‎
Lines changed: 68 additions & 9 deletions b/‎backend/booksbot/booksbot/pipelines.py‎
Lines changed: 68 additions & 9 deletions
diff --git a/‎backend/booksbot/booksbot/settings.py‎
Lines changed: 3 additions & 3 deletions b/‎backend/booksbot/booksbot/settings.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎backend/booksbot/booksbot/spiders/manning_spider.py‎
Lines changed: 76 additions & 40 deletions b/‎backend/booksbot/booksbot/spiders/manning_spider.py‎
Lines changed: 76 additions & 40 deletions
diff --git a/‎backend/booksbot/booksbot/utils.py‎
Lines changed: 83 additions & 0 deletions b/‎backend/booksbot/booksbot/utils.py‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎backend/booksbot/main.py‎
Lines changed: 0 additions & 1 deletion b/‎backend/booksbot/main.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎backend/migrations/versions/5d1bca12fab8_.py‎
Lines changed: 43 additions & 0 deletions b/‎backend/migrations/versions/5d1bca12fab8_.py‎
Lines changed: 43 additions & 0 deletions
@@ -5,27 +5,86 @@
 
 
 # useful for handling different item types with a single interface
-from itemadapter import ItemAdapter
+
+from datetime import datetime
+
+from booksbot.utils import extract_authors
+from sqlalchemy import select
+
+from python_talk.app import create_app
+from python_talk.extensions import db
+from python_talk.models.book import Book, Author
 
 
 # class BooksbotPipeline:
 #     def process_item(self, item, spider):
 #         return item
-from python_talk.models.book import Book
-from python_talk.extensions import db
+
 
 class BookPipeline:
+    """Scrapy item pipeline that stores scraped books and their authors in the database."""
+
     def process_item(self, item, spider):
-        """
-        将采集到的数据存入数据库
-        """
-        print(item)
-        book = Book(**item)
-        db.session.add(book)
-        db.session.commit()
+        """
+        Store one scraped book and its authors in the database.
+
+        Authors are looked up by name and created when missing. The book is
+        inserted only when no row with the same ISBN exists yet.
+
+        :param item: scraped book data; reads 'authorshipDisplay', 'isbn',
+            'title', 'price', 'link', 'publisher' and 'date'
+        :param spider: the spider that produced the item (unused)
+        :return: the unmodified item, so later pipeline stages still receive it
+        """
+        with self.app.app_context():
+            # A missing author field is treated as "no authors" rather than
+            # handing None to extract_authors().
+            author_name = item.get('authorshipDisplay') or ''
+            author_lst = extract_authors(author_name)
+
+            # Reuse an author that already exists (the same name is treated as
+            # the same person for now); otherwise create a new one.
+            authors = []
+            for name in author_lst:
+                stmt = select(Author).where(Author.name == name)
+                author = db.session.execute(stmt).scalar_one_or_none()
+
+                if not author:
+                    author = Author(name=name)
+                    db.session.add(author)
+                    db.session.flush()
+
+                authors.append(author)
+
+            isbn = item.get("isbn")
+
+            stmt = select(Book).where(Book.isbn == isbn)
+            existing_book = db.session.execute(stmt).scalar_one_or_none()
+            if not existing_book:
+                book_data = {
+                    'title': item['title'],
+                    'isbn': isbn,
+                    'price': item['price'],
+                    'url': item['link'],
+                    'publisher': item['publisher'],
+                    'publication_date': datetime.strptime(item['date'], "%Y-%m-%dT%H:%M:%S%z").date(),
+                }
+                book = Book(**book_data)
+                # Link every collected author, not only the loop variable left
+                # over from the last iteration (which is unbound when the
+                # author list is empty).
+                book.authors = authors
+
+                db.session.add(book)
+                db.session.commit()
+
+        return item
 
     def open_spider(self, spider):
-        print('spider 打开')
+        """Create the application whose app context process_item and close_spider enter."""
+        self.app = create_app()
 
     def close_spider(self, spider):
-        print('spider 关闭')
+        """Remove the database session when the spider closes."""
+        with self.app.app_context():
+            db.session.remove()
+
+# if __name__ == '__main__':
+#     app = create_app()
+#     with app.app_context():
+#         item = {
+#             "title": "Generative AI in Action",
+#             "author": "Alessandro Negro with Vlastimil Kus, Giuseppe Futia and Fabio Montagna<br><i>Forewords by Maxime Labonne, Khalifeh AlJadda<\u002fi>",
+#             "price": 47.99,
+#             "url": "https://www.manning.com/books/generative-ai-in-action",
+#             "description": None,
+#             "publisher": "manning",
+#             'isbn': '9781617291326',
+#             'date': "2020-11-29T00:00:00-0500",
+#         }
+#         pipeline = BookPipeline()
+#         pipeline.process_item(item, spider=None)
@@ -22,9 +22,9 @@
 ROBOTSTXT_OBEY = True
 
 # Concurrency and throttling settings
-#CONCURRENT_REQUESTS = 16
-CONCURRENT_REQUESTS_PER_DOMAIN = 1
-DOWNLOAD_DELAY = 1
+CONCURRENT_REQUESTS = 16
+CONCURRENT_REQUESTS_PER_DOMAIN = 16
+DOWNLOAD_DELAY = 0.25
 
 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False
 
@@ -1,64 +1,100 @@
 import json
+import re
 
 import scrapy
-from twisted.web.xmlrpc import payloadTemplate
 
 
 class ManningSpider(scrapy.Spider):
+    """Spider that collects Python books from the Manning catalog search endpoint."""
+
     name = "manning"
     allowed_domains = ['www.manning.com']
 
+    # headers is used by several methods (e.g. start_requests() and parse()),
+    # so it is declared as a class attribute.
+    headers = {
+        'Content-Type': 'application/json',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                      '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
+        'Accept': 'application/json, text/plain, */*',
+        'Origin': 'https://www.manning.com',
+        'Referer': 'https://www.manning.com/',
+    }
+
+    payload = {
+        "accessType": [],
+        # keywords: keywords contained in the book title
+        "keywords": [],
+        "level": [],
+        "meapFilter": "published",
+        "productType": [
+            "book"
+        ],
+        "programmingLanguages": [
+            "python"
+        ],
+        "selectedCategoryIds": [
+            1
+        ],
+        "sort": "newest",
+        "includePrices": True
+    }
+
     def start_requests(self):
-        """
-        重写该方法以执行 POST 方法
-        """
-        headers = {
-            'Content-Type': 'application/json',
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
-                          '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
-            'Accept': 'application/json, text/plain, */*',
-            'Origin': 'https://www.manning.com',
-            'Referer': 'https://www.manning.com/',
-        }
-        payload = {
-            "accessType": [],
-            # keywords: 书名里包含的关键字
-            "keywords": [],
-            "level": [],
-            "meapFilter": "published",
-            "productType": [
-                "book"
-            ],
-            "programmingLanguages": [
-                "python"
-            ],
-            "selectedCategoryIds": [
-                1
-            ],
-            "sort": "newest",
-            "includePrices": True
-        }
+        """
+        Override the default start request so the catalog is fetched with a POST.
+        """
 
         yield scrapy.Request(
             url='https://www.manning.com/search/getCatalogData',
             method='POST',
-            headers=headers,
-            body=json.dumps(payload),
-            # cookies=cookies,
+            headers=self.headers,
+            body=json.dumps(self.payload),
             callback=self.parse,
-            # meta={'page': self.start_page, 'payload': payload},
         )
 
+    def parse_detail(self, response):
+        """
+        Read the ISBN from a book detail page and yield the completed item.
+
+        The book dict built in parse() arrives via ``response.meta['book']``.
+        When the page shows an ISBN, its digits (hyphens removed) are stored
+        in ``book['isbn']``; otherwise the existing value is kept, and a
+        missing or empty value becomes None.
+        """
+        import time
+        self.logger.info(f"parse_detail start {time.time()}")
+        book = response.meta['book']
+
+        # Text of the product-meta list entry that starts with "ISBN"
+        isbn_text = response.xpath(
+            '//div[contains(@class,"product-meta")]'
+            '//li[starts-with(normalize-space(),"ISBN")]/text()'
+        ).get()
+        # .get() returns None when nothing matches, and re.search() rejects
+        # None, so fall back to an empty string.
+        m = re.search(r'ISBN(?:-13)?:?\s*([0-9\-]{10,17})', isbn_text or '')
+        if m:
+            book['isbn'] = m.group(1).replace('-', '')
+
+        book['isbn'] = book.get('isbn') or None
+
+        yield book
+
     def parse(self, response):
+        """
+        Handle one page of catalog results.
+
+        ``response`` is the JSON reply to the POST request issued by
+        start_requests() or, for later pages, by this method itself. One
+        detail-page request is yielded per product, followed by a request
+        for the next page when the reply reports one.
+        """
         data = response.json()
         books = data['products']
         for book in books:
-            item_data = {
-                'title': book['title'],
-                'author': book['authorshipDisplay'],
-                'price': book['price'],
-                'url': book['link'],
-                'description': None,
-                'publisher': 'manning',
-            }
-            yield item_data
+            book['publisher'] = 'manning'
+            yield scrapy.Request(
+                # urljoin leaves an absolute link unchanged and resolves a
+                # relative one against the response URL.
+                url=response.urljoin(book['link']),
+                callback=self.parse_detail,
+                meta={'book': book},
+            )
+
+        # Request the next page while the reply reports that one exists.
+        # `and` binds tighter than `:=`, so the assignment is parenthesised.
+        if (pagination := data.get('pagination')) and pagination.get('hasNextPage'):
+            payload = self.payload.copy()
+            payload['page'] = pagination['page'] + 1
+
+            yield scrapy.Request(
+                url='https://www.manning.com/search/getCatalogData',
+                method='POST',
+                headers=self.headers,
+                body=json.dumps(payload),
+                callback=self.parse,
+            )
@@ -0,0 +1,83 @@
+"""
+Author string formats used by the publisher Manning:
+1. A single author: "Taehun Kim"
+2. Several authors separated by commas: "Benjamin Tan Wei Hao, Shanoop Padmanabhan, and Varun Mallya"
+3. Authors followed by HTML markup: "Constantin Gonciulea and Charlee Stefanski<br><i>Foreword by Heather Higgins<\u002fi>"
+4. Authors joined by "with" and/or "and": "Jungjun Hur and Younghee Song"
+5. Authors followed by extra information: "Edward Raff, Drew Farris and Stella Biderman for Booz Allen Hamilton"
+
+"""
+import re
+from html import unescape
+
+
+def extract_authors(authorship_display: str):
+    """
+    Split a Manning author display string into a list of author names.
+
+    HTML tags, a trailing "Foreword(s) by ..." credit and a trailing
+    "for <organisation>" suffix are removed; what remains is split on
+    commas, " and " and " with ".
+
+    :param authorship_display: raw author string; None or "" yields []
+    :return: author names in their original order, stripped of whitespace
+    """
+    # Nothing to parse; also avoids passing None to unescape().
+    if not authorship_display:
+        return []
+
+    # Decode HTML character references (e.g. &amp;)
+    s = unescape(authorship_display)
+
+    # Remove HTML tags
+    s = re.sub(r'<[^>]+>', '', s)
+
+    # Remove the foreword credit
+    s = re.sub(r'Forewords? by.*$', '', s, flags=re.IGNORECASE)
+
+    # Remove the organisation suffix (e.g. "... for Booz Allen Hamilton")
+    s = re.sub(r'\s+for\s+.*$', '', s, flags=re.IGNORECASE)
+
+    # Turn the word separators into commas
+    s = s.replace(' and ', ', ')
+    s = s.replace(' with ', ', ')
+
+    # Split into a list, dropping empty pieces
+    authors = [a.strip() for a in s.split(',') if a.strip()]
+
+    return authors
+
+
+# if __name__ == '__main__':
+#     data = [
+#         "Reuven M. Lerner",
+#         "Taehun Kim",
+#         "Val Andrei Fajardo",
+#         "Benjamin Tan Wei Hao, Shanoop Padmanabhan, and Varun Mallya",
+#         "Luis G. Serrano",
+#         "Noah Flynn",
+#         "Aneev Kochakadan",
+#         "Tomasz Lelek and Artur Skowroński",
+#         "Roberto Infante",
+#         "Sebastian Raschka",
+#         "Luca Antiga, Eli Stevens, Howard Huang, Thomas Viehmann",
+#         "Nicole Koenigstein<br><i>Foreword by Luis Serrano</i>",
+#         "Wei-Meng Lee",
+#         "Pekka Enberg",
+#         "José Haro Peralta<br><i>Foreword by Dan Barahona</i>",
+#         "Alessandro Negro with Vlastimil Kus, Giuseppe Futia and Fabio Montagna<br><i>Forewords by Maxime Labonne, Khalifeh AlJadda</i>",
+#         "Will Kurt",
+#         "Rush Shahani",
+#         "Jungjun Hur and Younghee Song",
+#         "Mariia Mykhailova",
+#         "François Chollet and Matthew Watson",
+#         "Justin Mitchel",
+#         "Tyler Suard",
+#         "Tomaž Bratanič and Oskar Hane<br><i>Foreword by Paco Nathan</i>",
+#         "Ashish Ranjan Jha",
+#         "Sebastian Raschka and Abhinav Kimothi",
+#         "Edward Raff, Drew Farris and Stella Biderman for Booz Allen Hamilton",
+#         "Emmanuel Maggiori",
+#         "Abhinav Kimothi",
+#         "Gianluigi Mucciolo",
+#         "Gianluigi Mucciolo",
+#         "Gianluigi Mucciolo",
+#         "Gianluigi Mucciolo",
+#         "Gianluigi Mucciolo",
+#         "Vaibhav Verdhan<br><i>Foreword by Ravi Gopalakrishnan</i>",
+#         "Constantin Gonciulea and Charlee Stefanski<br><i>Foreword by Heather Higgins</i>",
+#         "Immanuel Trummer",
+#         "Christopher Kardell and Mark Brouwer",
+#         "Rob Reider and Alexander Michalka",
+#         "Mona Khalil<br><i>Foreword by Barry McCardel</i>"
+#     ]
+#     for name in data:
+#         result = extract_authors(name)
+#         print(result)
+#     print(unescape("Constantin Gonciulea and Charlee Stefanski<br><i>Foreword by Heather Higgins<\u002fi>"))
@@ -3,5 +3,4 @@
 
 if __name__ == '__main__':
     print(1)
-    print(2)
     execute(['scrapy', 'crawl', 'manning'])
@@ -0,0 +1,43 @@
+"""empty message
+
+Revision ID: 5d1bca12fab8
+Revises: 8aa8c270e402
+Create Date: 2026-01-01 13:48:56.078170
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import mysql
+
+# revision identifiers, used by Alembic.
+revision = '5d1bca12fab8'
+down_revision = '8aa8c270e402'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Fill missing ISBNs with a placeholder, then make book.isbn NOT NULL."""
+    # isbn used to be nullable because it could not always be found; now that
+    # it can be, the column is made non-nullable. Rows that still hold NULL
+    # first receive a 'TEMP_<id>' placeholder so no NULL remains.
+    op.execute('''
+               UPDATE book
+               SET isbn = CONCAT('TEMP_', id)
+               WHERE isbn IS NULL
+               ''')
+
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('book', schema=None) as batch_op:
+        batch_op.alter_column('isbn',
+                              existing_type=mysql.VARCHAR(length=128),
+                              nullable=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    """Make book.isbn nullable again; placeholder values written by upgrade() are not reverted."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('book', schema=None) as batch_op:
+        batch_op.alter_column('isbn',
+                              existing_type=mysql.VARCHAR(length=128),
+                              nullable=True)
+
+    # ### end Alembic commands ###