diff --git a/.github/workflows/jisungin_dev.yml b/.github/workflows/jisungin_dev.yml
index 7f1ad86..847a231 100644
--- a/.github/workflows/jisungin_dev.yml
+++ b/.github/workflows/jisungin_dev.yml
@@ -48,6 +48,14 @@ jobs:
           DEV_SECRET_DIR_FILE_NAME: application-oauth.yml
         run: echo $DEV_SECRET | base64 --decode >> $DEV_SECRET_DIR/$DEV_SECRET_DIR_FILE_NAME
 
+      # application-crawler.yml
+      - name: Copy crawler Secret
+        env:
+          DEV_SECRET: ${{ secrets.APPLICATION_CRAWLER_YML }}
+          DEV_SECRET_DIR: src/main/resources
+          DEV_SECRET_DIR_FILE_NAME: application-crawler.yml
+        run: echo $DEV_SECRET | base64 --decode >> $DEV_SECRET_DIR/$DEV_SECRET_DIR_FILE_NAME
+
       # application-jwt.yml
       # - name: Copy jwt Secret
       #   env:
diff --git a/.gitignore b/.gitignore
index 6bd7fc8..b7a6e16 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,4 +40,5 @@ out/
 application-dev.yml
 application-prod.yml
 application-jwt.yml
-application-oauth.yml
\ No newline at end of file
+application-oauth.yml
+application-crawler.yml
\ No newline at end of file
diff --git a/build.gradle b/build.gradle
index 438b377..cbe4611 100644
--- a/build.gradle
+++ b/build.gradle
@@ -58,7 +58,8 @@ dependencies {
     runtimeOnly 'io.netty:netty-resolver-dns-native-macos:4.1.104.Final:osx-aarch_64'
     // Jsoup Web Crawling Library
     implementation 'org.jsoup:jsoup:1.16.2'
-
+    // JsonPath Parse Json Library
+    implementation 'com.jayway.jsonpath:json-path:2.9.0'
 }
 
 tasks.named('bootBuildImage') {
diff --git a/src/main/java/com/jisungin/exception/ErrorCode.java b/src/main/java/com/jisungin/exception/ErrorCode.java
index f8d5bc2..e6a3e33 100644
--- a/src/main/java/com/jisungin/exception/ErrorCode.java
+++ b/src/main/java/com/jisungin/exception/ErrorCode.java
@@ -16,7 +16,8 @@ public enum ErrorCode {
     TALK_ROOM_NOT_FOUND(400, "토크방을 찾을 수 없습니다."),
     UNAUTHORIZED_REQUEST(400, "권한이 없는 사용자입니다."),
     COMMENT_NOT_FOUND(404, "의견을 찾을 수 없습니다."),
-    REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다.");
+    REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다."),
+    REQUEST_TIME_OUT(408, "요청 시간이 만료 되었습니다.");
 
     private final int code;
 
diff --git a/src/main/java/com/jisungin/infra/crawler/Crawler.java b/src/main/java/com/jisungin/infra/crawler/Crawler.java
index 653e128..5ffc118 100644
--- a/src/main/java/com/jisungin/infra/crawler/Crawler.java
+++ b/src/main/java/com/jisungin/infra/crawler/Crawler.java
@@ -1,7 +1,11 @@
 package com.jisungin.infra.crawler;
 
+import java.util.Map;
+
 public interface Crawler {
 
     CrawlingBook crawlBook(String isbn);
+    /** Crawls the best-seller list; the result maps each ranking to its crawled book. */
+    Map<Long, CrawlingBook> crawlBestSellerBook();
 
 }
diff --git a/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java b/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
index 9de8f88..54fa4e9 100644
--- a/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
+++ b/src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
@@ -1,24 +1,47 @@
 package com.jisungin.infra.crawler;
 
+import java.time.LocalDateTime;
 import lombok.Builder;
 import lombok.Getter;
+import lombok.ToString;
 
 @Getter
+@ToString
 public class CrawlingBook {
 
-    private String imageUrl;
+    private String title;
     private String content;
+    private String isbn;
+    private String publisher;
+    private String imageUrl;
+    private String thumbnail;
+    private String[] authors;
+    private LocalDateTime dateTime;
 
     @Builder
-    private CrawlingBook(String imageUrl, String content) {
-        this.imageUrl = imageUrl;
+    private CrawlingBook(String title, String content, String isbn, String publisher, String imageUrl, String thumbnail,
+            String authors, LocalDateTime dateTime) {
+        this.title = title;
         this.content = content;
+        this.isbn = isbn;
+        this.publisher = publisher;
+        this.imageUrl = imageUrl;
+        this.thumbnail = thumbnail;
+        this.authors = parseAuthorsToArr(authors);
+        this.dateTime = dateTime;
     }
 
-    public static CrawlingBook of(String imageUrl, String content) {
+    public static CrawlingBook of(String title, String content, String isbn, String publisher, String imageUrl,
+            String thumbnail, String authors, LocalDateTime dateTime) {
         return CrawlingBook.builder()
-                .imageUrl(imageUrl)
+                .title(title)
                 .content(content)
+                .isbn(isbn)
+                .publisher(publisher)
+                .imageUrl(imageUrl)
+                .thumbnail(thumbnail)
+                .authors(authors)
+                .dateTime(dateTime)
                 .build();
     }
 
@@ -26,4 +49,21 @@ public boolean isBlankContent() {
         return this.content.isBlank();
     }
 
+    /**
+     * Splits the raw author text into individual author names.
+     * Text from the first role suffix (" 저", " 공저", " 글", " 편저", " 원저") onward is dropped and the
+     * rest is split on commas, with the whitespace around each name removed.
+     *
+     * @param authors raw author text; may be {@code null}
+     * @return the author names, or an empty array when no name is present
+     */
+    private String[] parseAuthorsToArr(String authors) {
+        if (authors == null) {
+            return new String[0];
+        }
+        // limit 2: split() then always yields a first element, even if the text starts with a suffix.
+        String names = authors.split(" 저| 공저| 글| 편저| 원저", 2)[0].strip();
+        return names.isEmpty() ? new String[0] : names.split("\\s*,\\s*");
+    }
+
 }
diff --git a/src/main/java/com/jisungin/infra/crawler/Fetcher.java b/src/main/java/com/jisungin/infra/crawler/Fetcher.java
index d327ed4..e95ccdd 100644
--- a/src/main/java/com/jisungin/infra/crawler/Fetcher.java
+++ b/src/main/java/com/jisungin/infra/crawler/Fetcher.java
@@ -6,5 +6,6 @@ public interface Fetcher {
 
     Document fetchIsbn(String isbn);
     Document fetchBook(String bookId);
+    Document fetchBestSellerBookId();
 
 }
diff --git a/src/main/java/com/jisungin/infra/crawler/Parser.java b/src/main/java/com/jisungin/infra/crawler/Parser.java
index 5bacb95..5d61182 100644
--- a/src/main/java/com/jisungin/infra/crawler/Parser.java
+++ b/src/main/java/com/jisungin/infra/crawler/Parser.java
@@ -1,10 +1,13 @@
 package com.jisungin.infra.crawler;
 
+import java.util.Map;
 import org.jsoup.nodes.Document;
 
 public interface Parser {
 
     String parseIsbn(Document doc);
     CrawlingBook parseBook(Document doc);
+    /** Extracts the best-seller book ids from the list page, keyed by ranking. */
+    Map<Long, String> parseBestSellerBookId(Document doc);
 
 }
diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java b/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java
index 3b52e3b..97e7127 100644
--- a/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java
+++ b/src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java
@@ -1,5 +1,9 @@
 package com.jisungin.infra.crawler;
 
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.CompletableFuture;
 import lombok.RequiredArgsConstructor;
 import org.springframework.stereotype.Component;
 
@@ -17,4 +21,19 @@ public CrawlingBook crawlBook(String isbn) {
         return parser.parseBook(fetcher.fetchBook(bookId));
     }
 
+
+    @Override
+    public Map<Long, CrawlingBook> crawlBestSellerBook() {
+        Map<Long, String> bestSellerBookIds = parser.parseBestSellerBookId(fetcher.fetchBestSellerBookId());
+        // One future per ranking: worker threads never put() into a shared HashMap (not thread-safe).
+        Map<Long, CompletableFuture<CrawlingBook>> futures = new HashMap<>();
+        bestSellerBookIds.forEach((ranking, bookId) -> futures.put(ranking,
+                CompletableFuture.supplyAsync(() -> parser.parseBook(fetcher.fetchBook(bookId)))));
+
+        CompletableFuture.allOf(futures.values().toArray(CompletableFuture[]::new)).join();
+
+        Map<Long, CrawlingBook> bestSellerBooks = new HashMap<>();
+        futures.forEach((ranking, future) -> bestSellerBooks.put(ranking, future.join()));
+        return bestSellerBooks;
+    }
+
 }
diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java b/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java
deleted file mode 100644
index d608123..0000000
--- a/src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.jisungin.infra.crawler;
-
-public class Yes24CrawlerConstant {
-
-    public static final String BASE_URL = "https://www.yes24.com/Product";
-    public static final String ISBN_URL = BASE_URL + "/Search?domain=BOOK&query=";
-    public static final String BOOK_URL = BASE_URL + "/Goods/";
-    public static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36";
-    public static final String ISBN_CSS = "ul#yesSchList > li";
-    public static final String ISBN_ATTR = "data-goods-no";
-    public static final String BOOK_IMAGE_CSS = "span.gd_img > em.imgBdr > img.gImg";
-    public static final String BOOK_IMAGE_ATTR = "src";
-    public static final String BOOK_CONTENT_CSS = "div.infoWrap_txt > div.infoWrap_txtInner";
-
-    public static String getIsbnUrl(String isbn) {
-        return ISBN_URL + isbn;
-    }
-
-    public static String getBookUrl(String bookId) {
-        return BOOK_URL + bookId;
-    }
-
-}
diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java b/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java
index 8288cb7..ac6efc9 100644
--- a/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java
+++ b/src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java
@@ -1,25 +1,34 @@
 package com.jisungin.infra.crawler;
 
-import static com.jisungin.infra.crawler.Yes24CrawlerConstant.*;
-import static com.jisungin.infra.crawler.Yes24CrawlerConstant.USER_AGENT;
-
 import com.jisungin.exception.BusinessException;
 import com.jisungin.exception.ErrorCode;
+import java.net.SocketTimeoutException;
+import lombok.Setter;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
+import org.springframework.boot.context.properties.ConfigurationProperties;
 import org.springframework.stereotype.Component;
 
 @Component
+@Setter
+@ConfigurationProperties(prefix = "crawler.yes24.fetcher")
 public class Yes24Fetcher implements Fetcher {
 
+    private String isbnUrl;
+    private String bookUrl;
+    private String bestBookUrl;
+    private String userAgent;
+
     @Override
     public Document fetchIsbn(String isbn) {
         try {
             return Jsoup.connect(getIsbnUrl(isbn))
                     .timeout(5000)
-                    .userAgent(USER_AGENT)
+                    .userAgent(userAgent)
                     .ignoreContentType(true)
                     .get();
+        } catch (SocketTimeoutException e) {
+            throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
         } catch (Exception e) {
             throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
         }
@@ -30,12 +39,37 @@ public Document fetchBook(String bookId) {
         try {
             return Jsoup.connect(getBookUrl(bookId))
                     .timeout(5000)
-                    .userAgent(USER_AGENT)
+                    .userAgent(userAgent)
+                    .ignoreContentType(true)
+                    .get();
+        } catch (SocketTimeoutException e) {
+            throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
+        } catch (Exception e) {
+            throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
+        }
+    }
+
+    @Override
+    public Document fetchBestSellerBookId() {
+        try {
+            return Jsoup.connect(bestBookUrl)
+                    .timeout(5000)
+                    .userAgent(userAgent)
                     .ignoreContentType(true)
                     .get();
+        } catch (SocketTimeoutException e) {
+            throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
         } catch (Exception e) {
             throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
         }
     }
 
+    private String getIsbnUrl(String isbn) {
+        return isbnUrl + isbn;
+    }
+
+    private String getBookUrl(String bookId) {
+        return bookUrl + bookId;
+    }
+
 }
diff --git a/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java b/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
index 637ab94..8b768a2 100644
--- a/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
+++ b/src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
@@ -1,29 +1,87 @@
 package com.jisungin.infra.crawler;
 
-import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_CONTENT_CSS;
-import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_ATTR;
-import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_CSS;
-import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_ATTR;
-import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_CSS;
-
+import com.jayway.jsonpath.JsonPath;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import lombok.Setter;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 import org.jsoup.safety.Safelist;
+import org.jsoup.select.Elements;
+import org.springframework.boot.context.properties.ConfigurationProperties;
 import org.springframework.stereotype.Component;
 
 @Component
+@Setter
+@ConfigurationProperties(prefix = "crawler.yes24.parser")
 public class Yes24Parser implements Parser {
+
+    private String isbnCss;
+    private String isbnAttr;
+    private String bookContentCss;
+    private String bookJsonCss;
+    private String bestRankingCss;
+    private String bestIdCss;
+    private String bestIdAttrs;
+
     @Override
     public String parseIsbn(Document doc) {
-        return doc.select(ISBN_CSS).attr(ISBN_ATTR);
+        return doc.select(isbnCss).attr(isbnAttr);
     }
 
     @Override
     public CrawlingBook parseBook(Document doc) {
-        String image = doc.select(BOOK_IMAGE_CSS).attr(BOOK_IMAGE_ATTR);
-        String content = Jsoup.clean(doc.select(BOOK_CONTENT_CSS).text(), Safelist.none());
+        String json = doc.select(bookJsonCss).html();
+
+        String title = parseJsonToString(json, "$.name");
+        String isbn = parseJsonToString(json, "$.workExample[0].isbn");
+        String imageUrl = parseJsonToString(json, "$.image");
+        String publisher = parseJsonToString(json, "$.publisher.name");
+        String authors = parseJsonToString(json, "$.author.name");
+        String thumbnail = imageUrl.replace("XL", "M");
+        String content = Jsoup.clean(doc.select(bookContentCss).text(), Safelist.none());
+        LocalDateTime dateTime = parseDate(parseJsonToString(json, "$.workExample[0].datePublished"));
+
+        return CrawlingBook.of(title, content, isbn, publisher, imageUrl, thumbnail, authors, dateTime);
+    }
+
+    /**
+     * Pairs each ranking on the best-seller page with the book id found at the same position.
+     *
+     * @param doc best-seller list page
+     * @return book ids keyed by ranking
+     */
+    @Override
+    public Map<Long, String> parseBestSellerBookId(Document doc) {
+        Elements rankings = doc.select(bestRankingCss);
+        List<String> bookIds = doc.select(bestIdCss)
+                .eachAttr(bestIdAttrs);
+        // eachAttr() skips elements without the attribute, so bookIds can be shorter than rankings;
+        // stop at the shorter list instead of letting bookIds.get(i) throw.
+        int pairCount = Math.min(rankings.size(), bookIds.size());
+
+        return IntStream.range(0, pairCount)
+                .boxed()
+                .collect(Collectors.toMap(
+                        i -> parseRanking(rankings.get(i)),
+                        bookIds::get));
+    }
+
+    private Long parseRanking(Element rankingElement) {
+        return Long.parseLong(rankingElement.text());
+    }
+
+    private String parseJsonToString(String json, String path) {
+        return JsonPath.read(json, path);
+    }
 
-        return CrawlingBook.of(image, content);
+    private LocalDateTime parseDate(String dateString) {
+        return LocalDate.parse(dateString).atStartOfDay();
     }
 
 }
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 735ecfd..34de983 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -10,4 +10,5 @@ spring:
       prod-env:
         - prod
     include:
-      oauth
\ No newline at end of file
+      - oauth
+      - crawler
\ No newline at end of file
diff --git a/src/test/java/com/jisungin/application/service/book/BookServiceTest.java b/src/test/java/com/jisungin/application/service/book/BookServiceTest.java
index 02c8094..2bc5747 100644
--- a/src/test/java/com/jisungin/application/service/book/BookServiceTest.java
+++ b/src/test/java/com/jisungin/application/service/book/BookServiceTest.java
@@ -85,7 +85,8 @@ public void createBook() {
                 .build();
 
         when(crawler.crawlBook(request.getIsbn()))
-                .thenReturn(CrawlingBook.of("도서 imageUrl", "도서 내용"));
+                .thenReturn(CrawlingBook.of("도서 제목", "도서 내용", "123456789X", "도서 출판사",
+                        "도서 imageUrl", "도서 썸네일", "도서 저자1, 도서 저자2", registeredDateTime));
 
         // when
         BookResponse response = bookService.createBook(request);
diff --git a/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java b/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java
index 9bbd27f..9269539 100644
--- a/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java
+++ b/src/test/java/com/jisungin/infra/Yes24CrawlerTest.java
@@ -2,6 +2,7 @@
 
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 
@@ -11,6 +12,9 @@
 import com.jisungin.infra.crawler.Yes24Crawler;
 import com.jisungin.infra.crawler.Yes24Fetcher;
 import com.jisungin.infra.crawler.Yes24Parser;
+import java.time.LocalDateTime;
+import java.util.HashMap;
+import java.util.Map;
 import org.jsoup.nodes.Document;
 import org.junit.jupiter.api.DisplayName;
 import org.junit.jupiter.api.Test;
@@ -41,7 +45,10 @@ public void crawlingBook() {
         Document isbnDocument = mock(Document.class);
         Document bookDocument = mock(Document.class);
 
-        CrawlingBook crawlingBook = CrawlingBook.of("image url link", "crawling content");
+        LocalDateTime registeredTime = LocalDateTime.of(2024, 1, 1, 0, 0);
+
+        CrawlingBook crawlingBook = CrawlingBook.of("도서 제목", "도서 내용", "도서 ISBN",
+                "도서 출판사", "도서 이미지 링크", "도서 썸네일", "도서 작가1", registeredTime);
 
when(fetcher.fetchIsbn(isbn)).thenReturn(isbnDocument); when(fetcher.fetchBook(bookId)).thenReturn(bookDocument); @@ -69,4 +76,41 @@ public void crawlingBookWithInvalidIsbn() { .hasMessage("책을 찾을 수 없습니다."); } + @Test + @DisplayName("베스트 셀러 책을 크롤링 한다.") + public void crawlingBestSeller() { + // given + Document bestSellerBookIdsDoc = mock(Document.class); + Document fetchBookDoc1 = mock(Document.class); + Document fetchBookDoc2 = mock(Document.class); + + Map bestSellerBookIds = new HashMap<>(); + bestSellerBookIds.put(1L, "00001"); + bestSellerBookIds.put(2L, "00002"); + + when(fetcher.fetchBestSellerBookId()).thenReturn(bestSellerBookIdsDoc); + when(parser.parseBestSellerBookId(any(Document.class))).thenReturn(bestSellerBookIds); + + when(fetcher.fetchBook("00001")).thenReturn(fetchBookDoc1); + when(fetcher.fetchBook("00002")).thenReturn(fetchBookDoc2); + + CrawlingBook book1 = CrawlingBook.of("책 제목1", "책 내용1", "책 ISBN1", "책 출판사1", + "책 이미지 URL1", "책 썸네일1", "책 저자1, 책 저자2", + LocalDateTime.of(2024, 1, 1, 0, 0)); + CrawlingBook book2 = CrawlingBook.of("책 제목2", "책 내용2", "책 ISBN2", "책 출판사2", + "책 이미지 URL2", "책 썸네일2", "책 저자3, 책 저자4", + LocalDateTime.of(2024, 1, 1, 0, 0)); + + when(parser.parseBook(fetchBookDoc1)).thenReturn(book1); + when(parser.parseBook(fetchBookDoc2)).thenReturn(book2); + + // when + Map bestSellerBooks = crawler.crawlBestSellerBook(); + + // then + assertThat(bestSellerBooks.size()).isEqualTo(2); + assertThat(bestSellerBooks.get(1L)).isEqualTo(book1); + assertThat(bestSellerBooks.get(2L)).isEqualTo(book2); + } + } diff --git a/src/test/resources/application.yml b/src/test/resources/application.yml index 67c8661..39d233d 100644 --- a/src/test/resources/application.yml +++ b/src/test/resources/application.yml @@ -19,4 +19,20 @@ spring: url: jdbc:h2:mem:jisungin username: sa password: - driver-class-name: org.h2.Driver \ No newline at end of file + driver-class-name: org.h2.Driver + +crawler: + yes24: + fetcher: + isbnUrl: 
"https://www.yes24.com/Product/Search?domain=BOOK&query=" + bookUrl: "https://www.yes24.com/Product/Goods/" + bestBookUrl: "https://www.yes24.com/Product/Category/BestSeller?categoryNumber=001&pageNumber=1&pageSize=100" + userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36" + parser: + isbnCss: "ul#yesSchList > li" + isbnAttr: "data-goods-no" + bookContentCss: "div.infoWrap_txt > div.infoWrap_txtInner" + bookJsonCss: "script[type=application/ld+json]" + bestRankingCss: "div.img_upper > em.ico.rank" + bestIdCss: "ul#yesBestList > li" + bestIdAttrs: "data-goods-no" \ No newline at end of file