35 changes: 35 additions & 0 deletions GoodreadsScraper/spiders/single_author_spider.py
@@ -0,0 +1,35 @@
"""Spider to extract URL's of books from a single author"""

import scrapy
from scrapy import signals
from .book_spider import BookSpider


class SingleAuthorSpider(scrapy.Spider):
"""Extract and crawl URLs of books by a specific author

This subsequently passes on the URLs to BookSpider.
Consequently, this spider also yields BookItem's and AuthorItem's.
"""
name = "single-author"

    def _set_crawler(self, crawler):
        super()._set_crawler(crawler)
        # Forward scraped items to the optional callback via the
        # item_scraped signal; skip connecting when no callback was given
        if self.item_scraped_callback:
            crawler.signals.connect(self.item_scraped_callback,
                                    signal=signals.item_scraped)

    def __init__(self, author_id, item_scraped_callback=None):
        super().__init__()
        self.book_spider = BookSpider()
        self.item_scraped_callback = item_scraped_callback
        # Crawl both the author's profile and the paginated list of works
        self.start_urls = [
            f"https://www.goodreads.com/author/show/{author_id}",
            f"https://www.goodreads.com/author/list/{author_id}",
        ]

    def parse(self, response):
        book_urls = response.css("a.bookTitle::attr(href)").extract()

        # Hand each book page off to BookSpider, which yields the
        # corresponding BookItem and AuthorItem objects
        for book_url in book_urls:
            yield response.follow(book_url, callback=self.book_spider.parse)

        # .get() returns None when there is no next page, whereas
        # .attrib['href'] raises a KeyError on the last page
        next_page = response.css("a.next_page::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
34 changes: 34 additions & 0 deletions crawl.py
@@ -160,6 +160,40 @@ def my_books(ctx, user_id: str, shelf: str, output_file_suffix: str):
shelf=shelf,
item_scraped_callback=progress_updater)

@crawl.command()
@click.option(
"--author_id",
required=True,
help="The author ID. This can be determined from the URL of the author, and is of the form '123456.firstname_lastname'",
prompt=True,
type=str)
@click.option("--output_file_suffix",
help="The suffix for the output file. [default: author_id]",
type=str)
@click.pass_context
def single_author(ctx, author_id: str, output_file_suffix: str):
"""Crawl books and author info for all books by a specific author."""
if not output_file_suffix:
output_file_suffix = author_id
click.echo(f"Crawling Goodreads author profile {author_id}")

    # Each page of the author's book list shows roughly 30 books, and the
    # last page may have fewer. Since the total number of distinct works
    # is not known up front, show an indeterminate spinner instead of a
    # bounded progress bar.
progress_updater = ProgressUpdater(infinite=True)

with progress_updater.progress:
        progress_updater.add_task_for(BookItem,
                                      description="[red]Scraping books...")
        progress_updater.add_task_for(AuthorItem,
                                      description="[green]Scraping authors...")

    _crawl('single-author',
           ctx.obj["LOG_FILE"],
           output_file_suffix,
           author_id=author_id,
           item_scraped_callback=progress_updater)

def _crawl(spider_name, log_file, output_file_suffix, **crawl_kwargs):
settings = get_project_settings()
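Usage note: assuming crawl.py is invoked directly, as with the existing commands, the new subcommand runs as, for example, python crawl.py single-author --author_id 123456.firstname_lastname (the ID shown is the placeholder format from the option's help text). Without --output_file_suffix, the output file suffix falls back to the author_id.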