35 changes: 35 additions & 0 deletions GoodreadsScraper/spiders/single_author_spider.py
@@ -0,0 +1,35 @@
"""Spider to extract URL's of books from a single author"""

import scrapy
from scrapy import signals
from .book_spider import BookSpider


class SingleAuthorSpider(scrapy.Spider):
"""Extract and crawl URLs of books by a specific author

This subsequently passes on the URLs to BookSpider.
Consequently, this spider also yields BookItem's and AuthorItem's.
"""
name = "single-author"

    def _set_crawler(self, crawler):
        super()._set_crawler(crawler)
        # Forward scraped items to the optional callback via the
        # item_scraped signal; skip connecting when no callback was given
        if self.item_scraped_callback:
            crawler.signals.connect(self.item_scraped_callback,
                                    signal=signals.item_scraped)

    def __init__(self, author_id, item_scraped_callback=None):
        super().__init__()
        self.book_spider = BookSpider()
        self.item_scraped_callback = item_scraped_callback
        # Crawl both the author's profile and the paginated list of works
        self.start_urls = [
            f"https://www.goodreads.com/author/show/{author_id}",
            f"https://www.goodreads.com/author/list/{author_id}",
        ]

    def parse(self, response):
        book_urls = response.css("a.bookTitle::attr(href)").extract()

        # Hand each book page off to BookSpider, which yields the
        # corresponding BookItem and AuthorItem objects
        for book_url in book_urls:
            yield response.follow(book_url, callback=self.book_spider.parse)

        # .get() returns None when there is no next page, whereas
        # .attrib['href'] raises a KeyError on the last page
        next_page = response.css("a.next_page::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
34 changes: 34 additions & 0 deletions crawl.py
@@ -160,6 +160,40 @@ def my_books(ctx, user_id: str, shelf: str, output_file_suffix: str):
shelf=shelf,
item_scraped_callback=progress_updater)

@crawl.command()
@click.option(
"--author_id",
required=True,
help="The author ID. This can be determined from the URL of the author, and is of the form '123456.firstname_lastname'",
prompt=True,
type=str)
@click.option("--output_file_suffix",
help="The suffix for the output file. [default: author_id]",
type=str)
@click.pass_context
def single_author(ctx, author_id: str, output_file_suffix: str):
"""Crawl books and author info for all books by a specific author."""
if not output_file_suffix:
output_file_suffix = author_id
click.echo(f"Crawling Goodreads author profile {author_id}")

    # Each page of the author's book list shows roughly 30 books, and the
    # last page may have fewer. Since the total number of distinct works
    # is not known up front, show an indeterminate spinner instead of a
    # bounded progress bar.
progress_updater = ProgressUpdater(infinite=True)

with progress_updater.progress:
        progress_updater.add_task_for(BookItem,
                                      description="[red]Scraping books...")
        progress_updater.add_task_for(AuthorItem,
                                      description="[green]Scraping authors...")

    _crawl('single-author',
           ctx.obj["LOG_FILE"],
           output_file_suffix,
           author_id=author_id,
           item_scraped_callback=progress_updater)

def _crawl(spider_name, log_file, output_file_suffix, **crawl_kwargs):
settings = get_project_settings()
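Usage note: assuming crawl.py is invoked directly, as with the existing commands, the new subcommand runs as, for example, python crawl.py single-author --author_id 123456.firstname_lastname (the ID shown is the placeholder format from the option's help text). Without --output_file_suffix, the output file suffix falls back to the author_id.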