diff --git a/go.mod b/go.mod
index d6c357d..b6e08f6 100644
--- a/go.mod
+++ b/go.mod
@@ -3,6 +3,8 @@ module github.com/ahobsonsayers/abs-goodreads
 go 1.21
 
 require (
+	github.com/antchfx/htmlquery v1.3.1
+	github.com/antchfx/xpath v1.3.0
 	github.com/deckarep/golang-set/v2 v2.6.0
 	github.com/deepmap/oapi-codegen/v2 v2.1.0
 	github.com/getkin/kin-openapi v0.122.0
@@ -16,6 +18,7 @@ require (
 	github.com/orsinium-labs/enum v1.4.0
 	github.com/samber/lo v1.39.0
 	github.com/stretchr/testify v1.9.0
+	golang.org/x/net v0.19.0
 	golang.org/x/text v0.14.0
 )
 
@@ -24,6 +27,7 @@ require (
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/go-openapi/jsonpointer v0.19.6 // indirect
 	github.com/go-openapi/swag v0.22.4 // indirect
+	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
 	github.com/google/uuid v1.5.0 // indirect
 	github.com/gorilla/mux v1.8.0 // indirect
 	github.com/invopop/yaml v0.2.0 // indirect
diff --git a/go.sum b/go.sum
index fc5c347..700b7d3 100644
--- a/go.sum
+++ b/go.sum
@@ -1,4 +1,8 @@
 github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
+github.com/antchfx/htmlquery v1.3.1 h1:wm0LxjLMsZhRHfQKKZscDf2COyH4vDYA3wyH+qZ+Ylc=
+github.com/antchfx/htmlquery v1.3.1/go.mod h1:PTj+f1V2zksPlwNt7uVvZPsxpKNa7mlVliCRxLX6Nx8=
+github.com/antchfx/xpath v1.3.0 h1:nTMlzGAK3IJ0bPpME2urTuFL76o4A96iYvoKFHRXJgc=
+github.com/antchfx/xpath v1.3.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
 github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
 github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
 github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
@@ -23,6 +27,8 @@ github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogB
 github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
 github.com/go-test/deep v1.0.8 h1:TDsG77qcSprGbC6vTN8OuXp5g+J+b5Pcguhf7Zt61VM=
 github.com/go-test/deep v1.0.8/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
 github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU=
 github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
@@ -98,6 +104,9 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
 golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
 golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
 golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
+golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
diff --git a/kindle/book.go b/kindle/book.go
new file mode 100644
index 0000000..24d6790
--- /dev/null
+++ b/kindle/book.go
@@ -0,0 +1,135 @@
+package kindle
+
+import (
+	"errors"
+	"strings"
+
+	"github.com/antchfx/htmlquery"
+	"github.com/antchfx/xpath"
+	"golang.org/x/net/html"
+)
+
+var (
+	bookCoverSetExpr  = xpath.MustCompile(`.//img/@srcset`)
+	bookFormatExpr    = xpath.MustCompile(`.//a[contains(text(), "Kindle Edition")]//text()`)
+	bookInfoExpr      = xpath.MustCompile(`.//div[contains(@class, "a-color-secondary")]`)
+	bookTitleExpr     = xpath.MustCompile(`.//h2`)
+	searchResultsExpr = xpath.MustCompile(`//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]`)
+)
+
+// Book is a kindle book parsed from an amazon search result.
+type Book struct {
+	ASIN   string
+	Title  string
+	Author string
+	Cover  string
+}
+
+// BookFromSearchResultHTML parses a book from a single search result node.
+// Returns nil if the node is nil, is not a kindle edition,
+// or is missing an asin or a title.
+func BookFromSearchResultHTML(resultNode *html.Node) *Book {
+	if resultNode == nil || !isKindleBook(resultNode) {
+		return nil
+	}
+
+	asin := bookAsin(resultNode)
+	if asin == "" {
+		return nil
+	}
+
+	title := bookTitle(resultNode)
+	if title == "" {
+		return nil
+	}
+
+	return &Book{
+		ASIN:   asin,
+		Title:  title,
+		Author: bookAuthor(resultNode),
+		Cover:  bookCover(resultNode),
+	}
+}
+
+// BooksFromHTML parses all kindle books from a search results page.
+// Search results that cannot be parsed into a book are skipped.
+// Returns an error if the search node is nil.
+func BooksFromHTML(searchNode *html.Node) ([]Book, error) {
+	if searchNode == nil {
+		return nil, errors.New("search html node is nil")
+	}
+
+	resultNodes := htmlquery.QuerySelectorAll(searchNode, searchResultsExpr)
+
+	books := make([]Book, 0, len(resultNodes))
+	for _, resultNode := range resultNodes {
+		book := BookFromSearchResultHTML(resultNode)
+		if book != nil {
+			books = append(books, *book)
+		}
+	}
+
+	return books, nil
+}
+
+// nodeText returns the inner text of the first node matching expr
+// below node, or an empty string if no node matches.
+// This guards against passing a nil node to htmlquery.InnerText.
+func nodeText(node *html.Node, expr *xpath.Expr) string {
+	matchedNode := htmlquery.QuerySelector(node, expr)
+	if matchedNode == nil {
+		return ""
+	}
+	return htmlquery.InnerText(matchedNode)
+}
+
+func isKindleBook(bookNode *html.Node) bool {
+	bookFormat := strings.ToLower(nodeText(bookNode, bookFormatExpr))
+	return strings.Contains(bookFormat, "kindle")
+}
+
+func bookAsin(bookNode *html.Node) string {
+	return htmlquery.SelectAttr(bookNode, "data-asin")
+}
+
+func bookTitle(bookNode *html.Node) string {
+	return strings.TrimSpace(nodeText(bookNode, bookTitleExpr))
+}
+
+func bookCover(bookNode *html.Node) string {
+	return bookCoverFromCoverSetAttrValue(nodeText(bookNode, bookCoverSetExpr))
+}
+
+func bookAuthor(bookNode *html.Node) string {
+	return bookAuthorFromInfoNodeValue(nodeText(bookNode, bookInfoExpr))
+}
+
+func bookCoverFromCoverSetAttrValue(coverSetAttrValue string) string {
+	// Covers are separated by , and contain a zoom suffix e.g. 2x
+	// The largest cover is the last in the cover set
+	largestCover := ""
+	for _, coverUrlWithZoom := range strings.Split(coverSetAttrValue, ",") {
+		// Strip the zoom suffix, skipping empty entries
+		// e.g. from an empty attribute or a trailing comma
+		coverFields := strings.Fields(coverUrlWithZoom)
+		if len(coverFields) > 0 {
+			largestCover = coverFields[0]
+		}
+	}
+
+	return largestCover
+}
+
+func bookAuthorFromInfoNodeValue(bookInfoNodeValue string) string {
+	// Book info parts are separated by | the first of which is the author
+	bookAuthorPart, _, _ := strings.Cut(bookInfoNodeValue, "|")
+
+	// Strip out the "by" from the author part
+	bookAuthorFields := strings.Fields(bookAuthorPart)
+	if len(bookAuthorFields) > 1 && strings.EqualFold(bookAuthorFields[0], "by") {
+		bookAuthorFields = bookAuthorFields[1:]
+	}
+
+	// Rejoin author fields
+	return strings.Join(bookAuthorFields, " ")
+}
diff --git a/kindle/book_test.go b/kindle/book_test.go
new file mode 100644
index 0000000..250ecc4
--- /dev/null
+++ b/kindle/book_test.go
@@ -0,0 +1,26 @@
+package kindle
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestBookAuthorFromInfoNodeValue(t *testing.T) {
+	authorNodeValue := "by J.R.R. Tolkien | Sold by: HarperCollins Publishers | Feb 15, 2012"
+	author := bookAuthorFromInfoNodeValue(authorNodeValue)
+	expectedAuthor := "J.R.R. Tolkien"
+	require.Equal(t, expectedAuthor, author)
+}
+
+func TestBookCoverFromCoverSetNodeValue(t *testing.T) {
+	coverSetNodeValue := "https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY218_.jpg 1x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY327_QL65_.jpg 1.5x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY436_QL65_.jpg 2x, https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY500_QL65_.jpg 2.2935x" // nolint
+	cover := bookCoverFromCoverSetAttrValue(coverSetNodeValue)
+	expectedCover := "https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY500_QL65_.jpg"
+	require.Equal(t, expectedCover, cover)
+}
+
+func TestBookCoverFromEmptyCoverSetNodeValue(t *testing.T) {
+	require.Empty(t, bookCoverFromCoverSetAttrValue(""))
+	require.Empty(t, bookCoverFromCoverSetAttrValue(" , "))
+}
diff --git a/kindle/kindle.go b/kindle/kindle.go
new file mode 100644
index 0000000..a06b9a8
--- /dev/null
+++ b/kindle/kindle.go
@@ -0,0 +1,97 @@
+package kindle
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"net/url"
+
+	"github.com/ahobsonsayers/abs-goodreads/utils"
+	"github.com/samber/lo"
+	"golang.org/x/net/html"
+)
+
+// DefaultAmazonURL is the amazon url used when no country code is given.
+const DefaultAmazonURL = "https://www.amazon.com"
+
+var (
+	defaultAmazonURL = lo.Must(url.Parse(DefaultAmazonURL))
+
+	// DefaultClient is a client that uses the default http client and amazon url.
+	DefaultClient = &Client{
+		client:    http.DefaultClient,
+		amazonUrl: utils.CloneURL(defaultAmazonURL),
+	}
+)
+
+// Client is a client for searching kindle books on amazon.
+type Client struct {
+	client    *http.Client
+	amazonUrl *url.URL
+}
+
+// URL returns a clone of the amazon url used by the client
+func (c *Client) URL() *url.URL { return utils.CloneURL(c.amazonUrl) }
+
+// get performs a GET request against the given path of the amazon url
+// with the given query parameters and returns the parsed html response.
+func (c *Client) get(
+	ctx context.Context,
+	path string,
+	parameters map[string]string,
+) (*html.Node, error) {
+	queryParams := url.Values{}
+	for key, value := range parameters {
+		queryParams.Add(key, value)
+	}
+
+	requestUrl := c.URL().JoinPath(path)
+	requestUrl.RawQuery = queryParams.Encode()
+
+	request, err := http.NewRequestWithContext(ctx, http.MethodGet, requestUrl.String(), http.NoBody)
+	if err != nil {
+		return nil, fmt.Errorf("creating request: %w", err)
+	}
+
+	// Amazon blocks some user agents.
+	// Setting an empty value stops net/http sending its default user agent.
+	request.Header.Set("User-Agent", "")
+
+	// Use the http client the kindle client was created with, not the default
+	response, err := c.client.Do(request)
+	if err != nil {
+		return nil, fmt.Errorf("performing request: %w", err)
+	}
+	defer response.Body.Close()
+
+	httpError := utils.HTTPResponseError(response)
+	if httpError != nil {
+		return nil, httpError
+	}
+
+	htmlResponse, err := html.Parse(response.Body)
+	if err != nil {
+		return nil, fmt.Errorf("parsing html response: %w", err)
+	}
+
+	return htmlResponse, nil
+}
+
+// Search requests the amazon search page for the given title and returns the kindle books found.
+// If author is not nil or empty, it is sent as the inauthor search parameter.
+func (c *Client) Search(ctx context.Context, title string, author *string) ([]Book, error) {
+	parameters := map[string]string{
+		"i": "digital-text",
+		"k": title,
+	}
+	if author != nil && *author != "" {
+		parameters["inauthor"] = *author
+	}
+
+	htmlResponse, err := c.get(ctx, "s", parameters)
+	if err != nil {
+		return nil, err
+	}
+
+	return BooksFromHTML(htmlResponse)
+}
diff --git a/kindle/kindle_test.go b/kindle/kindle_test.go
new file mode 100644
index 0000000..bc518bb
--- /dev/null
+++ b/kindle/kindle_test.go
@@ -0,0 +1,28 @@
+package kindle_test
+
+import (
+	"context"
+	"testing"
+
+	"github.com/ahobsonsayers/abs-goodreads/kindle"
+	"github.com/samber/lo"
+	"github.com/stretchr/testify/require"
+)
+
+const (
+	TheHobbitBookTitle  = "The Hobbit"
+	TheHobbitBookAuthor = "J. R. R. Tolkien"
+)
+
+func TestSearchBook(t *testing.T) {
+	// Should return https://www.amazon.com/dp/B007978NU6
+	books, err := kindle.DefaultClient.Search(context.Background(), TheHobbitBookTitle, lo.ToPtr(TheHobbitBookAuthor))
+	require.NoError(t, err)
+	require.NotEmpty(t, books)
+
+	book := books[0]
+	require.Equal(t, "B007978NU6", book.ASIN)
+	require.Equal(t, "The Hobbit: 75th Anniversary Edition", book.Title)
+	require.Equal(t, "J.R.R. Tolkien and Christopher Tolkien", book.Author)
+	require.Equal(t, "https://m.media-amazon.com/images/I/61Ng-W9EhBL._AC_UY500_QL65_.jpg", book.Cover)
+}
diff --git a/kindle/new.go b/kindle/new.go
new file mode 100644
index 0000000..c1312ec
--- /dev/null
+++ b/kindle/new.go
@@ -0,0 +1,47 @@
+package kindle
+
+import (
+	"fmt"
+	"net/http"
+	"net/url"
+	"strings"
+
+	"github.com/ahobsonsayers/abs-goodreads/utils"
+	"github.com/samber/lo"
+)
+
+var countryAmazonURLs = map[string]*url.URL{
+	"au": lo.Must(url.Parse("https://www.amazon.com.au")),
+	"ca": lo.Must(url.Parse("https://www.amazon.ca")),
+	"de": lo.Must(url.Parse("https://www.amazon.de")),
+	"es": lo.Must(url.Parse("https://www.amazon.es")),
+	"fr": lo.Must(url.Parse("https://www.amazon.fr")),
+	"in": lo.Must(url.Parse("https://www.amazon.co.in")),
+	"it": lo.Must(url.Parse("https://www.amazon.it")),
+	"jp": lo.Must(url.Parse("https://www.amazon.co.jp")),
+	"uk": lo.Must(url.Parse("https://www.amazon.co.uk")),
+	"us": defaultAmazonURL,
+}
+
+// NewClient creates a client. If client is nil, the default http client will be used.
+// If country code is nil or empty, amazon.com will be used as the url.
+// Will return an error if the country code is invalid.
+func NewClient(client *http.Client, countryCode *string) (*Client, error) {
+	if client == nil {
+		client = http.DefaultClient
+	}
+
+	amazonUrl := defaultAmazonURL
+	if countryCode != nil && *countryCode != "" {
+		// Country codes are matched case-insensitively, ignoring any
+		// surrounding slashes or spaces e.g. "ES", " es " and "/es/"
+		code := strings.ToLower(strings.Trim(*countryCode, "/ "))
+		countryAmazonUrl, ok := countryAmazonURLs[code]
+		if !ok {
+			return nil, fmt.Errorf("invalid country code: %s", *countryCode)
+		}
+		amazonUrl = countryAmazonUrl
+	}
+
+	return &Client{client: client, amazonUrl: utils.CloneURL(amazonUrl)}, nil
+}
diff --git a/kindle/new_test.go b/kindle/new_test.go
new file mode 100644
index 0000000..725527b
--- /dev/null
+++ b/kindle/new_test.go
@@ -0,0 +1,37 @@
+package kindle_test
+
+import (
+	"net/http"
+	"testing"
+
+	"github.com/ahobsonsayers/abs-goodreads/kindle"
+	"github.com/stretchr/testify/require"
+)
+
+func TestNewNoParameters(t *testing.T) {
+	client, err := kindle.NewClient(nil, nil)
+	require.NoError(t, err)
+	require.Equal(t, kindle.DefaultAmazonURL, client.URL().String())
+}
+
+func TestNewWithParameters(t *testing.T) {
+	httpClient := &http.Client{}
+	countryCode := "es"
+	client, err := kindle.NewClient(httpClient, &countryCode)
+	require.NoError(t, err)
+	require.Equal(t, "https://www.amazon.es", client.URL().String())
+}
+
+func TestNewWithUppercaseCountryCode(t *testing.T) {
+	countryCode := "ES"
+	client, err := kindle.NewClient(nil, &countryCode)
+	require.NoError(t, err)
+	require.Equal(t, "https://www.amazon.es", client.URL().String())
+}
+
+func TestNewWithInvalidCountryCode(t *testing.T) {
+	countryCode := "zz"
+	client, err := kindle.NewClient(nil, &countryCode)
+	require.Error(t, err)
+	require.Nil(t, client)
+}