Skip to content
This repository was archived by the owner on Mar 9, 2022. It is now read-only.

Commit ab33e43

Browse files
committed
Linux crawler book.
1 parent baf05e9 commit ab33e43

File tree

7 files changed

+127
-40
lines changed

7 files changed

+127
-40
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ This project includes but is not limited to the following:
3535
- [x] Lookup of Chinese characters, idioms, and xiehouyu (two-part allegorical sayings)
3636
- [x] Crawler example: crawling iOS job listings and returning the crawl results
3737
- [x] Example of crawling novels: mortal cultivation into a biography of God
38-
- [x] **Python** interaction: `Swift` calls native `Python(.py)` with parameter interaction examples;
38+
- [x] **Python** interaction: `Swift` calls native `Python(.py)` with parameter interaction examples
3939
- [x] Example of mail delivery
4040
- [x] HTML display example.
4141

VaporServer/Package.resolved

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

VaporServer/Package.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,13 @@ let package = Package(
2727
.package(url: "https://github.com/vapor/redis.git", from: "3.0.0-rc"),
2828

2929
.package(url: "https://github.com/scinfu/SwiftSoup.git", from: "1.7.1"),
30+
.package(url: "https://github.com/PerfectSideRepos/Perfect-ICONV.git",from:"3.0.1")
3031
],
3132
targets: [
3233
.target(name: "App", dependencies: ["SwiftSMTP",
3334
"Leaf",
3435
"FluentPostgreSQL",
35-
// "FluentMySQL",
36+
"PerfectICONV",
3637
"Vapor",
3738
"JWTMiddleware",
3839
"JWT",
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
2+
<!DOCTYPE html>
<html>

<head>
    <!-- Declare the encoding explicitly, as login.leaf does; the rendered chapter text is non-ASCII. -->
    <meta charset="UTF-8">
    <title>#(bookName)</title>
    <style>
        * {
            background: #ffffff;
            /* The previous value "left:15,right:15,top:15,bottom:15" is not valid CSS
               and was dropped by browsers; this is the equivalent valid shorthand. */
            padding: 15px;
        }
        .container {
            margin: auto;
            position: absolute;
            text-overline-color: #f2f2f2;
        }
    </style>
</head>

<!-- A document may have only one body, and it must wrap the visible content. -->
<body>
    <div class="container">
        <h2> #(chaptName),#(time) </h2>
        <h4> #(content) </h4>
    </div>
</body>

</html>

VaporServer/Resources/Views/leaf/login.leaf

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,9 @@
44
<head>
55
<meta charset="UTF-8">
66
<title>Login</title>
7-
8-
9-
107
<link rel="stylesheet" href="/js/login/css/style.css">
11-
128
</head>
139

14-
1510
<body>
1611

1712
<div class="wrapper">
@@ -40,13 +35,9 @@
4035
</div>
4136
<script src='http://cdnjs.cloudflare.com/ajax/libs/jquery/2.1.3/jquery.min.js'></script>
4237

43-
44-
4538
<script src="/js/login/js/index.js"></script>
4639

4740

48-
49-
5041
</body>
5142

5243
</html>

VaporServer/Sources/App/Controllers/BookController.swift

Lines changed: 88 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import Vapor
1010
import Fluent
1111
import FluentPostgreSQL
1212
import SwiftSoup
13+
import PerfectICONV
1314

1415
private let header: HTTPHeaders = ["User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
1516
,"Cookie": "yunsuo_session_verify=2a87ab507187674302f32bbc33248656"]
@@ -30,6 +31,9 @@ class BookController: RouteCollection {
3031
group.get("story", use: getBookLastChapterContentHandler)
3132
group.get("start", use: crawlerFanRenBookHandler)
3233

34+
//test
35+
group.get("html", use: getHtmlDataHandler)
36+
3337
}
3438
}
3539

@@ -38,26 +42,70 @@ extension BookController {
3842
func getBookLastChapterContentHandler(_ req: Request) throws -> Future<Response> {
3943
let name = req.query[String.self,at:"name"] ?? ""
4044

41-
return BookChapter.query(on: req).filter(\.bookName,.like,"%\(name)%").all().flatMap({ (books) in
42-
if books.count > 0 {
43-
return try ResponseJSON<BookChapter>(data: books.last).encode(for: req)
44-
}else {
45-
return try ResponseJSON<BookChapter>(status: .error, message: "没有此书: \(name)").encode(for: req)
46-
}
47-
})
45+
return BookInfo.query(on: req)
46+
.filter(\.bookName,.like,"%\(name)%")
47+
.first()
48+
.flatMap({ (info) in
49+
guard let info = info else {
50+
return try ResponseJSON<Empty>(status: .error, message: "没有此书: \(name)").encode(for: req)
51+
}
52+
return BookChapter
53+
.query(on: req)
54+
.filter(\.bookId == info.bookId)
55+
.sort(\.chapterId,.descending)
56+
.first()
57+
.flatMap({ (chapter) in
58+
struct Chapter: Content {
59+
var bookName: String?
60+
var time: String?
61+
var chaptName: String?
62+
var content: String?
63+
}
64+
let pter = Chapter(bookName: info.bookName, time: chapter?.updateTime, chaptName: "最新章节:" + (chapter?.chapterName ?? ""), content: chapter?.content)
65+
return try req.view().render("leaf/book",pter).encode(for: req)
66+
})
67+
})
68+
4869
}
4970

50-
func crawlerFanRenBookHandler(_ req: Request) throws -> Future<Response> {
71+
/// Test endpoint: downloads a fixed piaotian.com book index page and returns it as UTF-8 text.
///
/// The response body is treated as GBK-encoded and converted with `Iconv` rather than
/// being decoded as UTF-8 directly.
///
/// - Parameter req: The incoming request; used to resolve the HTTP client and to
///   collect the streamed response body.
/// - Returns: A future holding the converted page text, or `""` when the
///   conversion does not produce a string.
/// - Throws: Any error raised while resolving the client or creating the converter.
func getHtmlDataHandler(_ req: Request) throws -> Future<String> {
    let client = try req.make(Client.self)
    let iconv = try Iconv(from: Iconv.CodePage.GBK, to: Iconv.CodePage.UTF8)
    let url = "https://www.piaotian.com/html/9/9102/"

    return client.get(url)
        // 1) read the complete body as raw Data
        .flatMap { $0.http.body.consumeData(on: req) }
        .map { (data: Data) -> String in
            // 2) Copy the payload into a [UInt8] for Iconv. The array initializer owns its
            //    storage. The earlier `UnsafeMutableBufferPointer(start: &bytes, ...)` form
            //    kept a pointer that is only valid for the duration of that initializer
            //    call, so the later copy wrote through a dangling pointer.
            let bytes = [UInt8](data)

            // 3) convert GBK -> UTF8
            return iconv.utf8(buf: bytes) ?? ""
        }
}
89+
90+
func crawlerFanRenBookHandler(_ req: Request) throws -> Future<ResponseJSON<Empty>> {
5191

5292
typeId = 9
5393
bookId = 9102
5494
let url = "https://www.piaotian.com/html/\(typeId)/\(bookId)/"
5595

56-
let client = try req.make(FoundationClient.self)
57-
return client.get(url,headers: header)
58-
.flatMap(to: Response.self, { clientResponse in
96+
97+
return try req.client().get(url)
98+
.flatMap { $0.http.body.consumeData(on: req) }
99+
.map { (data) -> String in
100+
let iconv = try Iconv(from: Iconv.CodePage.GBK, to: Iconv.CodePage.UTF8)
101+
var bytes = [UInt8](repeating: 0, count: data.count)
102+
let buffer = UnsafeMutableBufferPointer(start: &bytes, count: bytes.count)
103+
_ = data.copyBytes(to: buffer)
104+
105+
return iconv.utf8(buf: bytes) ?? "g/u"
106+
}.map({ html -> ResponseJSON<Empty> in
59107

60-
let html = clientResponse.http.body.gbkString
108+
print(html,"\n\n\n")
61109
let document = try SwiftSoup.parse(html)
62110
let mainBody = try document.select("div[class='mainbody']")
63111

@@ -77,21 +125,14 @@ extension BookController {
77125
self.elements = revertLis
78126
self.currentIndex = 0
79127

80-
_ = BookInfo.query(on: req).filter(\.bookId == self.bookId).first().map({ (exist) in
81-
82-
if var exist = exist {
83-
exist.chapterCount = revertLis.count
84-
exist.updateTime = TimeManager.currentTime()
85-
_ = exist.update(on: req)
86-
debugPrint("本书已存在:\(exist.bookName ?? "") \(TimeManager.currentTime())")
87-
}else {
88-
let bookInfo = BookInfo(id: nil, typeId: self.typeId, bookId: self.bookId, bookName: bookName, chapterCount: revertLis.count, updateTime: TimeManager.currentTime(), content: nil, auther: auther,bookImg: nil)
89-
_ = bookInfo.save(on: req).map({ (info) in
90-
debugPrint("已保存本书:\(info)")
91-
})
92-
}
93-
94-
})
128+
try self.bookExistHandler(req,
129+
revertLis: revertLis,
130+
bookName: bookName,
131+
auther: auther)
132+
133+
if self.elements == nil || self.elements?.count == 0 {
134+
return ResponseJSON<Empty>(status: .error, message: "没有数据")
135+
}
95136

96137
func runRepeatTimer() throws {
97138

@@ -108,11 +149,30 @@ extension BookController {
108149
}
109150
try runRepeatTimer()
110151

111-
return try ResponseJSON<Empty>(status: .ok, message: "开始爬取 \(self.typeId)/\(self.bookId)").encode(for: req)
152+
return ResponseJSON<Empty>(status: .ok, message: "开始爬取 \(self.typeId)/\(self.bookId)")
112153
})
154+
113155
}
114156

115157

158+
/// Records the crawled book in `BookInfo`.
///
/// Looks up the row whose `bookId` equals `self.bookId`. When one exists, its chapter
/// count and update time are refreshed; otherwise a new row is saved. The lookup and
/// the follow-up write are started but their futures are discarded, so this method
/// returns before either has completed.
///
/// - Parameters:
///   - req: Request used as the database connection source.
///   - revertLis: Crawled chapter elements; only their count is stored.
///   - bookName: Title stored on a newly created row.
///   - auther: Author stored on a newly created row.
func bookExistHandler(_ req: Request,revertLis: [Element],bookName: String, auther: String) throws {
    let lookup = BookInfo.query(on: req)
        .filter(\.bookId == self.bookId)
        .first()

    _ = lookup.map { found -> Void in
        guard var known = found else {
            // No row for this book yet: create one from the crawled metadata.
            let fresh = BookInfo(id: nil,
                                 typeId: self.typeId,
                                 bookId: self.bookId,
                                 bookName: bookName,
                                 chapterCount: revertLis.count,
                                 updateTime: TimeManager.currentTime(),
                                 content: nil,
                                 auther: auther,
                                 bookImg: nil)
            _ = fresh.save(on: req).map { saved in
                debugPrint("已保存本书:\(saved)")
            }
            return
        }

        // Row already present: refresh the chapter count and timestamp.
        known.chapterCount = revertLis.count
        known.updateTime = TimeManager.currentTime()
        _ = known.update(on: req)
        debugPrint("本书已存在:\(known.bookName ?? "") \(TimeManager.currentTime())")
    }
}
175+
116176
// 保存每一章内容。
117177
func saveBookContentHandler(req: Request,bookName: String,auther: String,bookId: Int,typeId: Int) throws {
118178

VaporServer/Sources/App/Controllers/CrawlerController.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,16 +412,18 @@ struct LGDetailItem: Content {
412412
}
413413

414414
extension HTTPBody {
415+
415416
var utf8String: String {
416417
return String(data: data ?? Data(), encoding: .utf8) ?? "n/a"
417418
}
418419

419420
var gbkString: String {
420-
421+
421422
//获取GBK编码,使用GB18030是因为它向下兼容GBK
422423
let cfEncoding = CFStringEncodings.GB_18030_2000
423424
let encoding = CFStringConvertEncodingToNSStringEncoding(CFStringEncoding(cfEncoding.rawValue))
424425
//从GBK编码的Data里初始化NSString,返回的NSString是UTF-16编码
426+
425427
if let str = NSString(data: data ?? Data(), encoding: encoding) {
426428
return str as String
427429
} else {

0 commit comments

Comments
 (0)