Skip to content

Commit

Permalink
Improve kindle functions
Browse files Browse the repository at this point in the history
  • Loading branch information
ahobsonsayers committed Apr 25, 2024
1 parent 0f43b35 commit a8db659
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 64 deletions.
122 changes: 69 additions & 53 deletions kindle/book.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@ package kindle

import (
"strings"
"time"

"github.com/ahobsonsayers/abs-goodreads/utils"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xpath"
"golang.org/x/net/html"
)

const publishDateLayout = "Jan 2, 2006"

var (
bookCoverSetExpr = xpath.MustCompile(`.//img/@srcset`)
bookFormatExpr = xpath.MustCompile(`.//a[contains(text(), "Kindle Edition")]//text()`)
Expand All @@ -17,52 +21,57 @@ var (
)

type Book struct {
ASIN string
Title string
Author string
Cover string
ASIN string
Title string
Author string
Cover string
PublishDate *time.Time
}

func BookFromSearchResultHTML(resultNode *html.Node) *Book {
if !isKindleBook(resultNode) {
return nil
// BooksFromHTML parses and returns the books from the html of a search results page
func BooksFromHTML(searchNode *html.Node) ([]Book, error) {
resultNodes := htmlquery.QuerySelectorAll(searchNode, searchResultsExpr)

books := make([]Book, 0, len(resultNodes))
for _, resultNode := range resultNodes {
if !isKindleBook(resultNode) {
continue
}

book := BookFromHTML(resultNode)
if book != nil {
books = append(books, *book)
}
}

asin := bookAsin(resultNode)
return books, nil
}

// BookFromHTML parses and returns a book from the html
// of a book result on the search results page
func BookFromHTML(bookNode *html.Node) *Book {
asin := bookAsin(bookNode)
if asin == "" {
return nil
}

title := bookTitle(resultNode)
title := bookTitle(bookNode)
if title == "" {
return nil
}

cover := bookCover(resultNode)
author := bookAuthor(resultNode)
cover := bookCover(bookNode)
author, publishDate := bookInfo(bookNode)

return &Book{
ASIN: asin,
Title: title,
Author: author,
Cover: cover,
ASIN: asin,
Title: title,
Author: author,
Cover: cover,
PublishDate: publishDate,
}
}

func BooksFromHTML(searchNode *html.Node) ([]Book, error) {
resultNodes := htmlquery.QuerySelectorAll(searchNode, searchResultsExpr)

books := make([]Book, 0, len(resultNodes))
for _, resultNode := range resultNodes {
book := BookFromSearchResultHTML(resultNode)
if book != nil {
books = append(books, *book)
}
}

return books, nil
}

func isKindleBook(bookNode *html.Node) bool {
bookFormatNode := htmlquery.QuerySelector(bookNode, bookFormatExpr)
if bookFormatNode == nil {
Expand All @@ -74,52 +83,52 @@ func isKindleBook(bookNode *html.Node) bool {
return strings.Contains(bookFormat, "kindle")
}

// bookAsin reads the book ASIN from the "data-asin" attribute of the book node.
func bookAsin(bookNode *html.Node) string {
	asin := htmlquery.SelectAttr(bookNode, "data-asin")
	return asin
}

// bookTitle gets the book title, with surrounding whitespace removed.
// Returns an empty string if no title node is found in the book node.
func bookTitle(bookNode *html.Node) string {
	titleNode := htmlquery.QuerySelector(bookNode, bookTitleExpr)
	if titleNode == nil {
		// htmlquery.InnerText dereferences the node it is given, so a
		// missing title must be handled here instead of being passed on.
		return ""
	}

	return strings.TrimSpace(htmlquery.InnerText(titleNode))
}

// bookCover gets the book cover.
func bookCover(bookNode *html.Node) string {
coverSetAttr := htmlquery.QuerySelector(bookNode, bookCoverSetExpr)
coverSetAttrValue := htmlquery.InnerText(coverSetAttr)
return bookCoverFromCoverSetAttrValue(coverSetAttrValue)
return parseBookCoversAttrValue(coverSetAttrValue)
}

func bookAuthor(bookNode *html.Node) string {
// bookInfo gets additional book info.
// Returns the author and the publish date (if found).
func bookInfo(bookNode *html.Node) (string, *time.Time) {
infoNode := htmlquery.QuerySelector(bookNode, bookInfoExpr)
bookInfoNodeValue := htmlquery.InnerText(infoNode)
return bookAuthorFromInfoNodeValue(bookInfoNodeValue)
return parseBookInfoNodeValue(bookInfoNodeValue)
}

func bookCoverFromCoverSetAttrValue(coverSetAttrValue string) string {
// Covers are separated by , and contain a zoom suffix e.g. 2x
coverUrlsWithZoom := strings.Split(coverSetAttrValue, ",")
if len(coverUrlsWithZoom) == 0 {
return ""
}

// Get cover urls without zoom
coverUrls := make([]string, 0, len(coverUrlsWithZoom))
for _, coverUrlWithZoom := range coverUrlsWithZoom {
coverUrl := strings.Fields(coverUrlWithZoom)[0]
coverUrls = append(coverUrls, coverUrl)
}

// Get largest cover (the last in the cover set)
largestCover := coverUrls[len(coverUrls)-1]

return largestCover
// parseBookCoversAttrValue parses the value of the book cover set attribute.
// Returns the url of the original/full-size book cover, or an empty string
// if the attribute value contains no url.
// See test for expected value format.
func parseBookCoversAttrValue(coverSetAttrValue string) string {
	// Each cover is listed as a url followed by a zoom suffix (e.g. 2x),
	// so the first whitespace-separated field is the first cover url.
	coverSetFields := strings.Fields(coverSetAttrValue)
	if len(coverSetFields) == 0 {
		// Empty or whitespace-only attribute: there is no cover to return.
		return ""
	}

	// The listed url is not the original full size cover;
	// sanitising the url is expected to yield the original.
	return utils.SanitiseImageURL(coverSetFields[0])
}

func bookAuthorFromInfoNodeValue(bookInfoNodeValue string) string {
// parseBookInfoNodeValue parses the value of the book info node
// Returns the author and the publish date (nil if the date cannot be parsed).
// See test for expected value format.
func parseBookInfoNodeValue(bookInfoNodeValue string) (string, *time.Time) {
// Book info parts are separated by |, the first of which is the author
bookInfoParts := strings.Split(bookInfoNodeValue, "|")
bookAuthorPart := bookInfoParts[0]
bookAuthorPart := strings.TrimSpace(bookInfoParts[0])
publishDatePart := strings.TrimSpace(bookInfoParts[2])

// Strip out the "by" from the author part
bookAuthorFields := strings.Fields(bookAuthorPart)
Expand All @@ -130,5 +139,12 @@ func bookAuthorFromInfoNodeValue(bookInfoNodeValue string) string {
// Rejoin author fields
bookAuthor := strings.Join(bookAuthorFields, " ")

return bookAuthor
// Parse the date string according to the defined layout
var publishDate *time.Time
parsedPublishDate, err := time.Parse(publishDateLayout, publishDatePart)
if err == nil {
publishDate = &parsedPublishDate
}

return bookAuthor, publishDate
}
25 changes: 15 additions & 10 deletions kindle/book_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,25 @@ package kindle

import (
"testing"
"time"

"github.com/stretchr/testify/require"
)

func TestBookAuthorFromInfoNodeValue(t *testing.T) {
authorNodeValue := "by J.R.R. Tolkien | Sold by: HarperCollins Publishers | Feb 15, 2012"
author := bookAuthorFromInfoNodeValue(authorNodeValue)
expectedAuthor := "J.R.R. Tolkien"
require.Equal(t, expectedAuthor, author)
func TestParseBookCoversAttrValue(t *testing.T) {
coverUrl := "https://m.media-amazon.com/images/I/61Ng-W9EhBL.jpg"

coverSetAttrValue := "https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY218_.jpg 1x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY327_QL65_.jpg 1.5x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY436_QL65_.jpg 2x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY500_QL65_.jpg 2.2935x" // nolint
parsedCoverUrl := parseBookCoversAttrValue(coverSetAttrValue)
require.Equal(t, coverUrl, parsedCoverUrl)
}

func TestBookCoverFromCoverSetNodeValue(t *testing.T) {
coverSetNodeValue := "https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY218_.jpg 1x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY327_QL65_.jpg 1.5x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY436_QL65_.jpg 2x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY500_QL65_.jpg 2.2935x" // nolint
cover := bookCoverFromCoverSetAttrValue(coverSetNodeValue)
expectedCover := "https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY500_QL65_.jpg"
require.Equal(t, expectedCover, cover)
func TestParseBookInfoNodeValue(t *testing.T) {
author := "J.R.R. Tolkien"
publishDate := time.Date(2012, time.February, 15, 0, 0, 0, 0, time.UTC)

bookInfoNodeValue := "by J.R.R. Tolkien | Sold by: HarperCollins Publishers | Feb 15, 2012"
parsedAuthor, parsedPublishDate := parseBookInfoNodeValue(bookInfoNodeValue)
require.Equal(t, author, parsedAuthor)
require.Equal(t, publishDate, *parsedPublishDate)
}
2 changes: 1 addition & 1 deletion kindle/kindle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ func TestSearchBook(t *testing.T) {
require.Equal(t, "B007978NU6", book.ASIN)
require.Equal(t, "The Hobbit: 75th Anniversary Edition", book.Title)
require.Equal(t, "J.R.R. Tolkien and Christopher Tolkien", book.Author)
require.Equal(t, "https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY500_QL65_.jpg", book.Cover)
require.Equal(t, "https://m.media-amazon.com/images/I/61Ng-W9EhBL.jpg", book.Cover)
}

0 comments on commit a8db659

Please sign in to comment.