Skip to content

Commit

Permalink
Merge pull request #42 from jisung-in/feature/34-crawling-best-book
Browse files Browse the repository at this point in the history
[Feature] 베스트 셀러 크롤링 기능 추가
  • Loading branch information
jwooo authored Mar 27, 2024
2 parents 04cfd5a + 40af1c1 commit 555e2c9
Show file tree
Hide file tree
Showing 16 changed files with 235 additions and 50 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/jisungin_dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ jobs:
DEV_SECRET_DIR_FILE_NAME: application-oauth.yml
run: echo $DEV_SECRET | base64 --decode >> $DEV_SECRET_DIR/$DEV_SECRET_DIR_FILE_NAME

# application-crawler.yml
- name: Copy crawler Secret
env:
DEV_SECRET: ${{ secrets.APPLICATION_CRAWLER_YML }}
DEV_SECRET_DIR: src/main/resources
DEV_SECRET_DIR_FILE_NAME: application-crawler.yml
run: echo $DEV_SECRET | base64 --decode >> $DEV_SECRET_DIR/$DEV_SECRET_DIR_FILE_NAME

# application-jwt.yml
# - name: Copy jwt Secret
# env:
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ out/
application-dev.yml
application-prod.yml
application-jwt.yml
application-oauth.yml
application-oauth.yml
application-crawler.yml
3 changes: 2 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ dependencies {
runtimeOnly 'io.netty:netty-resolver-dns-native-macos:4.1.104.Final:osx-aarch_64'
// Jsoup Web Crawling Library
implementation 'org.jsoup:jsoup:1.16.2'

// JsonPath Parse Json Library
implementation 'com.jayway.jsonpath:json-path:2.9.0'
}

tasks.named('bootBuildImage') {
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/com/jisungin/exception/ErrorCode.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ public enum ErrorCode {
TALK_ROOM_NOT_FOUND(400, "토크방을 찾을 수 없습니다."),
UNAUTHORIZED_REQUEST(400, "권한이 없는 사용자입니다."),
COMMENT_NOT_FOUND(404, "의견을 찾을 수 없습니다."),
REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다.");
REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다."),
REQUEST_TIME_OUT(408, "요청 시간이 만료 되었습니다.");


private final int code;
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Crawler.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package com.jisungin.infra.crawler;

import java.util.Map;

/**
 * Entry point for crawling book data from an external book site.
 */
public interface Crawler {

/**
 * Crawls the data of a single book.
 *
 * @param isbn ISBN used to look the book up
 * @return the crawled book data
 */
CrawlingBook crawlBook(String isbn);

/**
 * Crawls every book currently on the best-seller list.
 *
 * @return crawled books keyed by a {@code Long}; in {@code Yes24Crawler} the keys are the ones
 *         produced by {@code Parser#parseBestSellerBookId}
 */
Map<Long, CrawlingBook> crawlBestSellerBook();

}
37 changes: 32 additions & 5 deletions src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
Original file line number Diff line number Diff line change
@@ -1,29 +1,56 @@
package com.jisungin.infra.crawler;

import java.time.LocalDateTime;
import lombok.Builder;
import lombok.Getter;
import lombok.ToString;

@Getter
@ToString
public class CrawlingBook {

private String imageUrl;
private String title;
private String content;
private String isbn;
private String publisher;
private String imageUrl;
private String thumbnail;
private String[] authors;
private LocalDateTime dateTime;

@Builder
private CrawlingBook(String imageUrl, String content) {
this.imageUrl = imageUrl;
private CrawlingBook(String title, String content, String isbn, String publisher, String imageUrl, String thumbnail,
String authors, LocalDateTime dateTime) {
this.title = title;
this.content = content;
this.isbn = isbn;
this.publisher = publisher;
this.imageUrl = imageUrl;
this.thumbnail = thumbnail;
this.authors = parseAuthorsToArr(authors);
this.dateTime = dateTime;
}

public static CrawlingBook of(String imageUrl, String content) {
public static CrawlingBook of(String title, String content, String isbn, String publisher, String imageUrl,
String thumbnail, String authors, LocalDateTime dateTime) {
return CrawlingBook.builder()
.imageUrl(imageUrl)
.title(title)
.content(content)
.isbn(isbn)
.publisher(publisher)
.imageUrl(imageUrl)
.thumbnail(thumbnail)
.authors(authors)
.dateTime(dateTime)
.build();
}

/**
 * Reports whether this book has no usable content text.
 *
 * <p>A {@code null} content is treated as blank instead of raising a
 * {@link NullPointerException}, since the builder accepts a missing content value.
 *
 * @return {@code true} if the content is {@code null}, empty, or whitespace only
 */
public boolean isBlankContent() {
    return this.content == null || this.content.isBlank();
}

/**
 * Splits a raw author string into individual author names.
 *
 * <p>Everything from the first role suffix ({@code " 저"}, {@code " 공저"}, {@code " 글"},
 * {@code " 편저"}, {@code " 원저"}) onward is discarded; the remaining text is split on commas
 * and the whitespace around each name is removed.
 *
 * @param authors raw author text, may be {@code null}
 * @return the author names; an empty array when {@code authors} is {@code null}
 */
private String[] parseAuthorsToArr(String authors) {
    if (authors == null) {
        return new String[0];
    }

    // A limit of 2 keeps the leading element even when the text is nothing but a suffix;
    // without it split() drops trailing empty strings and [0] can be out of bounds.
    String names = authors.split(" 저| 공저| 글| 편저| 원저", 2)[0];

    // Consume the whitespace around each comma so "A, B" yields "A" and "B", not " B".
    return names.strip().split("\\s*,\\s*");
}

}
1 change: 1 addition & 0 deletions src/main/java/com/jisungin/infra/crawler/Fetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ public interface Fetcher {

Document fetchIsbn(String isbn);
Document fetchBook(String bookId);
Document fetchBestSellerBookId();

}
2 changes: 2 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Parser.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package com.jisungin.infra.crawler;

import java.util.Map;
import org.jsoup.nodes.Document;

/**
 * Extracts crawler data from documents fetched by a {@code Fetcher}.
 */
public interface Parser {

/**
 * Extracts a single string value from an ISBN lookup page.
 *
 * @param doc the fetched lookup page
 * @return the extracted value; presumably the site-specific book id later passed to
 *         {@code Fetcher#fetchBook} (verify against the caller)
 */
String parseIsbn(Document doc);

/**
 * Builds the crawled book data from a book detail page.
 *
 * @param doc the fetched book detail page
 * @return the crawled book data
 */
CrawlingBook parseBook(Document doc);

/**
 * Extracts the best-seller entries from the best-seller listing page.
 *
 * @param doc the fetched best-seller page
 * @return a map whose values are passed to {@code Fetcher#fetchBook} by {@code Yes24Crawler};
 *         in {@code Yes24Parser} the keys are the parsed ranking numbers
 */
Map<Long, String> parseBestSellerBookId(Document doc);

}
19 changes: 19 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package com.jisungin.infra.crawler;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;

Expand All @@ -17,4 +21,19 @@ public CrawlingBook crawlBook(String isbn) {
return parser.parseBook(fetcher.fetchBook(bookId));
}

/**
 * Crawls every book on the best-seller list, fetching the detail pages concurrently.
 *
 * <p>Each detail page is fetched and parsed in its own asynchronous task. The results are
 * copied into the returned map on the calling thread only after all tasks have finished:
 * {@link HashMap} is not safe for concurrent writes, so the worker threads must not put
 * into it directly.
 *
 * @return crawled books keyed by the keys returned from {@code parser.parseBestSellerBookId}
 * @throws java.util.concurrent.CompletionException if any fetch or parse task fails
 */
@Override
public Map<Long, CrawlingBook> crawlBestSellerBook() {
    Map<Long, String> bestSellerBookIds = parser.parseBestSellerBookId(fetcher.fetchBestSellerBookId());

    // Populated on this thread only; the async tasks just compute their own result.
    Map<Long, CompletableFuture<CrawlingBook>> futures = new HashMap<>();
    bestSellerBookIds.forEach((ranking, bookId) -> futures.put(ranking,
            CompletableFuture.supplyAsync(() -> parser.parseBook(fetcher.fetchBook(bookId)))));

    CompletableFuture.allOf(futures.values().toArray(CompletableFuture[]::new)).join();

    // Every future is complete here, so join() returns immediately.
    Map<Long, CrawlingBook> bestSellerBooks = new HashMap<>();
    futures.forEach((ranking, future) -> bestSellerBooks.put(ranking, future.join()));

    return bestSellerBooks;
}

}
23 changes: 0 additions & 23 deletions src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java

This file was deleted.

44 changes: 39 additions & 5 deletions src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
package com.jisungin.infra.crawler;

import static com.jisungin.infra.crawler.Yes24CrawlerConstant.*;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.USER_AGENT;

import com.jisungin.exception.BusinessException;
import com.jisungin.exception.ErrorCode;
import java.net.SocketTimeoutException;
import lombok.Setter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

@Component
@Setter
@ConfigurationProperties(prefix = "crawler.yes24.fetcher")
public class Yes24Fetcher implements Fetcher {

private String isbnUrl;
private String bookUrl;
private String bestBookUrl;
private String userAgent;

@Override
public Document fetchIsbn(String isbn) {
try {
return Jsoup.connect(getIsbnUrl(isbn))
.timeout(5000)
.userAgent(USER_AGENT)
.userAgent(userAgent)
.ignoreContentType(true)
.get();
} catch (SocketTimeoutException e) {
throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
} catch (Exception e) {
throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
}
Expand All @@ -30,12 +39,37 @@ public Document fetchBook(String bookId) {
try {
return Jsoup.connect(getBookUrl(bookId))
.timeout(5000)
.userAgent(USER_AGENT)
.userAgent(userAgent)
.ignoreContentType(true)
.get();
} catch (SocketTimeoutException e) {
throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
} catch (Exception e) {
throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
}
}

/**
 * Downloads the best-seller listing page from the configured {@code bestBookUrl}.
 *
 * @return the fetched page
 * @throws BusinessException with {@code REQUEST_TIME_OUT} when the request times out, or with
 *         {@code BOOK_NOT_FOUND} for any other failure
 */
@Override
public Document fetchBestSellerBookId() {
    try {
        var request = Jsoup.connect(bestBookUrl)
                .timeout(5000)
                .userAgent(userAgent)
                .ignoreContentType(true);
        return request.get();
    } catch (SocketTimeoutException timeout) {
        throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
    } catch (Exception failure) {
        throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
    }
}

/**
 * Builds the ISBN lookup URL by appending the ISBN to the configured {@code isbnUrl} prefix.
 *
 * @param isbn ISBN to look up; appended as-is, without URL encoding
 * @return the lookup URL
 */
private String getIsbnUrl(String isbn) {
return isbnUrl + isbn;
}

/**
 * Builds the book detail URL by appending the book id to the configured {@code bookUrl} prefix.
 *
 * @param bookId site-specific book id; appended as-is, without URL encoding
 * @return the book detail URL
 */
private String getBookUrl(String bookId) {
return bookUrl + bookId;
}

}
69 changes: 59 additions & 10 deletions src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
Original file line number Diff line number Diff line change
@@ -1,29 +1,78 @@
package com.jisungin.infra.crawler;

import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_CONTENT_CSS;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_ATTR;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_CSS;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_ATTR;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_CSS;

import com.jayway.jsonpath.JsonPath;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import lombok.Setter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

@Component
@Setter
@ConfigurationProperties(prefix = "crawler.yes24.parser")
public class Yes24Parser implements Parser {

private String isbnCss;
private String isbnAttr;
private String bookContentCss;
private String bookJsonCss;
private String bestRankingCss;
private String bestIdCss;
private String bestIdAttrs;

@Override
public String parseIsbn(Document doc) {
return doc.select(ISBN_CSS).attr(ISBN_ATTR);
return doc.select(isbnCss).attr(isbnAttr);
}

@Override
public CrawlingBook parseBook(Document doc) {
String image = doc.select(BOOK_IMAGE_CSS).attr(BOOK_IMAGE_ATTR);
String content = Jsoup.clean(doc.select(BOOK_CONTENT_CSS).text(), Safelist.none());
String json = doc.select(bookJsonCss).html();

String title = parseJsonToString(json, "$.name");
String isbn = parseJsonToString(json, "$.workExample[0].isbn");
String imageUrl = parseJsonToString(json, "$.image");
String publisher = parseJsonToString(json, "$.publisher.name");
String authors = parseJsonToString(json, "$.author.name");
String thumbnail = imageUrl.replace("XL", "M");
String content = Jsoup.clean(doc.select(bookContentCss).text(), Safelist.none());
LocalDateTime dateTime = parseDate(parseJsonToString(json, "$.workExample[0].datePublished"));

return CrawlingBook.of(title, content, isbn, publisher, imageUrl, thumbnail, authors, dateTime);
}

/**
 * Pairs each ranking found on the best-seller page with the book id at the same position.
 *
 * <p>Rankings are the elements matching {@code bestRankingCss}; book ids are the
 * {@code bestIdAttrs} attribute values of the elements matching {@code bestIdCss}.
 *
 * @param doc the fetched best-seller page
 * @return book ids keyed by their parsed ranking
 * @throws IndexOutOfBoundsException if fewer book ids than rankings were found
 * @throws IllegalStateException if two rankings parse to the same number
 */
@Override
public Map<Long, String> parseBestSellerBookId(Document doc) {
    List<String> ids = doc.select(bestIdCss).eachAttr(bestIdAttrs);
    Elements rankingElements = doc.select(bestRankingCss);
    int rankingCount = rankingElements.size();

    return IntStream.range(0, rankingCount)
            .boxed()
            .collect(Collectors.toMap(
                    index -> parseRanking(rankingElements.get(index)),
                    index -> ids.get(index)));
}

/**
 * Converts the text of a ranking element into its numeric ranking.
 *
 * @param rankingElement element whose text is the ranking number
 * @return the parsed ranking
 * @throws NumberFormatException if the element text is not a parsable {@code long}
 */
private Long parseRanking(Element rankingElement) {
    String rankingText = rankingElement.text();
    return Long.valueOf(rankingText);
}

/**
 * Reads the value at a JsonPath expression from a JSON document and returns it as a string.
 *
 * @param json JSON text to read from
 * @param path JsonPath expression selecting the value, e.g. {@code $.name}
 * @return the selected value; the generic result of {@code JsonPath.read} is taken as a
 *         {@code String}, so the path is expected to select a string value
 */
private String parseJsonToString(String json, String path) {
return JsonPath.read(json, path);
}

return CrawlingBook.of(image, content);
/**
 * Parses a date string and returns the date-time at the start of that day.
 *
 * @param dateString date in the format accepted by {@link LocalDate#parse(CharSequence)}
 * @return midnight at the start of the parsed date
 * @throws java.time.format.DateTimeParseException if the text cannot be parsed
 */
private LocalDateTime parseDate(String dateString) {
    LocalDate publishedDate = LocalDate.parse(dateString);
    return publishedDate.atStartOfDay();
}

}
3 changes: 2 additions & 1 deletion src/main/resources/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ spring:
prod-env:
- prod
include:
oauth
- oauth
- crawler
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ public void createBook() {
.build();

when(crawler.crawlBook(request.getIsbn()))
.thenReturn(CrawlingBook.of("도서 imageUrl", "도서 내용"));
.thenReturn(CrawlingBook.of("도서 제목", "도서 내용", "123456789X", "도서 출판사",
"도서 imageUrl", "도서 썸네일", "도서 저자1, 도서 저자2", registeredDateTime));

// when
BookResponse response = bookService.createBook(request);
Expand Down
Loading

0 comments on commit 555e2c9

Please sign in to comment.