Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] 베스트 셀러 크롤링 기능 추가 #42

Merged
merged 4 commits into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/jisungin_dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ jobs:
DEV_SECRET_DIR_FILE_NAME: application-oauth.yml
run: echo $DEV_SECRET | base64 --decode >> $DEV_SECRET_DIR/$DEV_SECRET_DIR_FILE_NAME

# application-crawler.yml
- name: Copy crawler Secret
env:
DEV_SECRET: ${{ secrets.APPLICATION_CRAWLER_YML }}
DEV_SECRET_DIR: src/main/resources
DEV_SECRET_DIR_FILE_NAME: application-crawler.yml
run: echo $DEV_SECRET | base64 --decode >> $DEV_SECRET_DIR/$DEV_SECRET_DIR_FILE_NAME

# application-jwt.yml
# - name: Copy jwt Secret
# env:
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ out/
application-dev.yml
application-prod.yml
application-jwt.yml
application-oauth.yml
application-oauth.yml
application-crawler.yml
3 changes: 2 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ dependencies {
runtimeOnly 'io.netty:netty-resolver-dns-native-macos:4.1.104.Final:osx-aarch_64'
// Jsoup Web Crawling Library
implementation 'org.jsoup:jsoup:1.16.2'

// JsonPath Parse Json Library
implementation 'com.jayway.jsonpath:json-path:2.9.0'
}

tasks.named('bootBuildImage') {
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/com/jisungin/exception/ErrorCode.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ public enum ErrorCode {
TALK_ROOM_NOT_FOUND(400, "토크방을 찾을 수 없습니다."),
UNAUTHORIZED_REQUEST(400, "권한이 없는 사용자입니다."),
COMMENT_NOT_FOUND(404, "의견을 찾을 수 없습니다."),
REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다.");
REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다."),
REQUEST_TIME_OUT(408, "요청 시간이 만료 되었습니다.");


private final int code;
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Crawler.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package com.jisungin.infra.crawler;

import java.util.Map;

/**
 * Entry point for crawling book data from an external book site.
 */
public interface Crawler {

/**
 * Crawls a single book identified by its ISBN.
 *
 * @param isbn ISBN of the book to look up
 * @return the crawled book data
 */
CrawlingBook crawlBook(String isbn);
/**
 * Crawls the books on the site's best-seller list.
 *
 * @return crawled books keyed by a {@code Long}; in {@code Yes24Crawler} the key is the
 *         ranking parsed from the best-seller page
 */
Map<Long, CrawlingBook> crawlBestSellerBook();

}
37 changes: 32 additions & 5 deletions src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
Original file line number Diff line number Diff line change
@@ -1,29 +1,56 @@
package com.jisungin.infra.crawler;

import java.time.LocalDateTime;
import lombok.Builder;
import lombok.Getter;
import lombok.ToString;

@Getter
@ToString
public class CrawlingBook {

private String imageUrl;
private String title;
private String content;
private String isbn;
private String publisher;
private String imageUrl;
private String thumbnail;
private String[] authors;
private LocalDateTime dateTime;

@Builder
private CrawlingBook(String imageUrl, String content) {
this.imageUrl = imageUrl;
private CrawlingBook(String title, String content, String isbn, String publisher, String imageUrl, String thumbnail,
String authors, LocalDateTime dateTime) {
this.title = title;
this.content = content;
this.isbn = isbn;
this.publisher = publisher;
this.imageUrl = imageUrl;
this.thumbnail = thumbnail;
this.authors = parseAuthorsToArr(authors);
this.dateTime = dateTime;
}

public static CrawlingBook of(String imageUrl, String content) {
public static CrawlingBook of(String title, String content, String isbn, String publisher, String imageUrl,
String thumbnail, String authors, LocalDateTime dateTime) {
return CrawlingBook.builder()
.imageUrl(imageUrl)
.title(title)
.content(content)
.isbn(isbn)
.publisher(publisher)
.imageUrl(imageUrl)
.thumbnail(thumbnail)
.authors(authors)
.dateTime(dateTime)
.build();
}

public boolean isBlankContent() {
return this.content.isBlank();
}

private String[] parseAuthorsToArr(String authors) {
return authors.split(" 저| 공저| 글| 편저| 원저")[0].split(",");
}

}
1 change: 1 addition & 0 deletions src/main/java/com/jisungin/infra/crawler/Fetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ public interface Fetcher {

/**
 * Fetches the document for an ISBN lookup.
 * In {@code Yes24Fetcher} the request URL is the configured ISBN URL with {@code isbn} appended.
 *
 * @param isbn ISBN to look up
 * @return the fetched document
 */
Document fetchIsbn(String isbn);
/**
 * Fetches the document for a single book.
 * In {@code Yes24Fetcher} the request URL is the configured book URL with {@code bookId} appended.
 *
 * @param bookId identifier of the book to fetch
 * @return the fetched document
 */
Document fetchBook(String bookId);
/**
 * Fetches the best-seller document, from which the book ids are parsed.
 * In {@code Yes24Fetcher} the request URL is the configured best-seller URL.
 *
 * @return the fetched document
 */
Document fetchBestSellerBookId();

}
2 changes: 2 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Parser.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package com.jisungin.infra.crawler;

import java.util.Map;
import org.jsoup.nodes.Document;

/**
 * Extracts book information from fetched documents.
 */
public interface Parser {

/**
 * Extracts a string value from an ISBN lookup document.
 * NOTE(review): the result appears to be used as the site's book id — confirm against the crawler.
 *
 * @param doc document to read from
 * @return the extracted value
 */
String parseIsbn(Document doc);
/**
 * Extracts the data of a single book from its document.
 *
 * @param doc document to read from
 * @return the parsed book
 */
CrawlingBook parseBook(Document doc);
/**
 * Extracts the best-seller list from its document.
 *
 * @param doc document to read from
 * @return book ids keyed by a {@code Long}; in {@code Yes24Parser} the key is the parsed ranking
 */
Map<Long, String> parseBestSellerBookId(Document doc);

}
19 changes: 19 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package com.jisungin.infra.crawler;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;

Expand All @@ -17,4 +21,19 @@ public CrawlingBook crawlBook(String isbn) {
return parser.parseBook(fetcher.fetchBook(bookId));
}

/**
 * Crawls every book on the best-seller list, fetching the individual books concurrently.
 *
 * <p>Blocks until all crawls have finished. If any crawl fails, the failure is rethrown
 * from {@code join()} as a {@code CompletionException}.
 *
 * @return the crawled books keyed by their best-seller ranking
 */
@Override
public Map<Long, CrawlingBook> crawlBestSellerBook() {
    Map<Long, String> bestSellerBookIds = parser.parseBestSellerBookId(fetcher.fetchBestSellerBookId());

    // Start one asynchronous crawl per book and remember its future by ranking. The
    // worker threads never touch a shared map: HashMap is not thread-safe, so writing
    // to it from completion callbacks would be a data race.
    Map<Long, CompletableFuture<CrawlingBook>> pendingBooks = new HashMap<>();
    bestSellerBookIds.forEach((ranking, bookId) -> pendingBooks.put(ranking,
            CompletableFuture.supplyAsync(() -> parser.parseBook(fetcher.fetchBook(bookId)))));

    CompletableFuture.allOf(pendingBooks.values().toArray(CompletableFuture[]::new)).join();

    // Every future is complete here, so join() returns immediately; the result map is
    // filled on the calling thread only.
    Map<Long, CrawlingBook> bestSellerBooks = new HashMap<>();
    pendingBooks.forEach((ranking, pendingBook) -> bestSellerBooks.put(ranking, pendingBook.join()));

    return bestSellerBooks;
}

}
23 changes: 0 additions & 23 deletions src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java

This file was deleted.

44 changes: 39 additions & 5 deletions src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
package com.jisungin.infra.crawler;

import static com.jisungin.infra.crawler.Yes24CrawlerConstant.*;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.USER_AGENT;

import com.jisungin.exception.BusinessException;
import com.jisungin.exception.ErrorCode;
import java.net.SocketTimeoutException;
import lombok.Setter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

@Component
@Setter
@ConfigurationProperties(prefix = "crawler.yes24.fetcher")
public class Yes24Fetcher implements Fetcher {

private String isbnUrl;
private String bookUrl;
private String bestBookUrl;
private String userAgent;

Comment on lines 12 to +21
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

크롤링을 위한 설정값을 바인딩 하기 위해 @Setter를 쓰신 건가요 ??
다른 방법은 없을까요 ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

네 맞습니다. @ConfigurationProperties는 yaml 파일에 있는 값을 동적으로 변경할 수 있습니다. 그렇기 때문에 생성된 빈에 대해서는 @Setter를 통해서 yaml 파일의 값을 가져야 설정할 수 있습니다.

다른 방법으로는 @Value를 통해 값을 주입할 수 있습니다. 필드마다 경로를 설정해야 하는 단점이 있지만, 주입받는 필드를 final로 설정할 수 있다는 장점이 있습니다.

현재는 주입받는 필드가 많아 @ConfigurationProperties를 사용하였습니다.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

오 그렇군요 ! 감사합니다.

@Override
public Document fetchIsbn(String isbn) {
try {
return Jsoup.connect(getIsbnUrl(isbn))
.timeout(5000)
.userAgent(USER_AGENT)
.userAgent(userAgent)
.ignoreContentType(true)
.get();
} catch (SocketTimeoutException e) {
throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
} catch (Exception e) {
throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
}
Expand All @@ -30,12 +39,37 @@ public Document fetchBook(String bookId) {
try {
return Jsoup.connect(getBookUrl(bookId))
.timeout(5000)
.userAgent(USER_AGENT)
.userAgent(userAgent)
.ignoreContentType(true)
.get();
} catch (SocketTimeoutException e) {
throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
} catch (Exception e) {
throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
}
}

/**
 * Downloads the best-seller page from the configured best-seller URL.
 *
 * @return the fetched document
 * @throws BusinessException with {@code REQUEST_TIME_OUT} when the request times out,
 *                           or {@code BOOK_NOT_FOUND} for any other failure
 */
@Override
public Document fetchBestSellerBookId() {
    try {
        // 5 second timeout, configured user agent, any content type accepted.
        var request = Jsoup.connect(bestBookUrl)
                .timeout(5000)
                .userAgent(userAgent)
                .ignoreContentType(true);

        return request.get();
    } catch (SocketTimeoutException timeout) {
        throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
    } catch (Exception failure) {
        throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
    }
}

/**
 * Builds the ISBN lookup URL by appending the ISBN to the configured base URL.
 *
 * @param isbn ISBN to look up
 * @return the request URL
 */
private String getIsbnUrl(String isbn) {
    final String requestUrl = isbnUrl + isbn;
    return requestUrl;
}

/**
 * Builds the book page URL by appending the book id to the configured base URL.
 *
 * @param bookId identifier of the book
 * @return the request URL
 */
private String getBookUrl(String bookId) {
    final String requestUrl = bookUrl + bookId;
    return requestUrl;
}

}
69 changes: 59 additions & 10 deletions src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
Original file line number Diff line number Diff line change
@@ -1,29 +1,78 @@
package com.jisungin.infra.crawler;

import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_CONTENT_CSS;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_ATTR;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_CSS;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_ATTR;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_CSS;

import com.jayway.jsonpath.JsonPath;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import lombok.Setter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

@Component
@Setter
@ConfigurationProperties(prefix = "crawler.yes24.parser")
public class Yes24Parser implements Parser {

private String isbnCss;
private String isbnAttr;
private String bookContentCss;
private String bookJsonCss;
private String bestRankingCss;
private String bestIdCss;
private String bestIdAttrs;

@Override
public String parseIsbn(Document doc) {
return doc.select(ISBN_CSS).attr(ISBN_ATTR);
return doc.select(isbnCss).attr(isbnAttr);
}

@Override
public CrawlingBook parseBook(Document doc) {
String image = doc.select(BOOK_IMAGE_CSS).attr(BOOK_IMAGE_ATTR);
String content = Jsoup.clean(doc.select(BOOK_CONTENT_CSS).text(), Safelist.none());
String json = doc.select(bookJsonCss).html();

String title = parseJsonToString(json, "$.name");
String isbn = parseJsonToString(json, "$.workExample[0].isbn");
String imageUrl = parseJsonToString(json, "$.image");
String publisher = parseJsonToString(json, "$.publisher.name");
String authors = parseJsonToString(json, "$.author.name");
String thumbnail = imageUrl.replace("XL", "M");
String content = Jsoup.clean(doc.select(bookContentCss).text(), Safelist.none());
LocalDateTime dateTime = parseDate(parseJsonToString(json, "$.workExample[0].datePublished"));

return CrawlingBook.of(title, content, isbn, publisher, imageUrl, thumbnail, authors, dateTime);
}

@Override
public Map<Long, String> parseBestSellerBookId(Document doc) {
Elements rankings = doc.select(bestRankingCss);
List<String> bookIds = doc.select(bestIdCss)
.eachAttr(bestIdAttrs);

return IntStream.range(0, rankings.size())
.boxed()
.collect(Collectors.toMap(
i -> parseRanking(rankings.get(i)),
bookIds::get));
}

private Long parseRanking(Element rankingElement) {
return Long.parseLong(rankingElement.text());
}

private String parseJsonToString(String json, String path) {
return JsonPath.read(json, path);
}

return CrawlingBook.of(image, content);
private LocalDateTime parseDate(String dateString) {
return LocalDate.parse(dateString).atStartOfDay();
}

}
3 changes: 2 additions & 1 deletion src/main/resources/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ spring:
prod-env:
- prod
include:
oauth
- oauth
- crawler
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ public void createBook() {
.build();

when(crawler.crawlBook(request.getIsbn()))
.thenReturn(CrawlingBook.of("도서 imageUrl", "도서 내용"));
.thenReturn(CrawlingBook.of("도서 제목", "도서 내용", "123456789X", "도서 출판사",
"도서 imageUrl", "도서 썸네일", "도서 저자1, 도서 저자2", registeredDateTime));

// when
BookResponse response = bookService.createBook(request);
Expand Down
Loading
Loading