Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 04978f9

Browse files
committed Jul 22, 2021
Merge branch 'release/0.7.5'
2 parents 1068e18 + 113eaa4 commit 04978f9

File tree

23 files changed

+588
-174
lines changed

23 files changed

+588
-174
lines changed
 

‎README-zh.md

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,10 @@
11
![logo](http://webmagic.io/images/logo.jpeg)
22

33

4+
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
5+
[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html)
46
[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic)
57

6-
78
官方网站[http://webmagic.io/](http://webmagic.io/)
89

910
>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
@@ -38,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
3839
<dependency>
3940
<groupId>us.codecraft</groupId>
4041
<artifactId>webmagic-core</artifactId>
41-
<version>0.7.4</version>
42+
<version>0.7.5</version>
4243
</dependency>
4344
<dependency>
4445
<groupId>us.codecraft</groupId>
4546
<artifactId>webmagic-extension</artifactId>
46-
<version>0.7.4</version>
47+
<version>0.7.5</version>
4748
</dependency>
4849
```
4950

‎README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md)
44

55

6+
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
7+
[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html)
68
[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic)
79

810
>A scalable crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content extraction, and persistence. It can simplify the development of a specific crawler.
@@ -23,12 +25,12 @@ Add dependencies to your pom.xml:
2325
<dependency>
2426
<groupId>us.codecraft</groupId>
2527
<artifactId>webmagic-core</artifactId>
26-
<version>0.7.4</version>
28+
<version>0.7.5</version>
2729
</dependency>
2830
<dependency>
2931
<groupId>us.codecraft</groupId>
3032
<artifactId>webmagic-extension</artifactId>
31-
<version>0.7.4</version>
33+
<version>0.7.5</version>
3234
</dependency>
3335
```
3436

‎pom.xml

Lines changed: 161 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
<?xml version="1.0" encoding="UTF-8"?>
22
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
33
<groupId>us.codecraft</groupId>
4-
<version>0.7.4</version>
4+
<version>0.7.5</version>
55
<modelVersion>4.0.0</modelVersion>
66
<packaging>pom</packaging>
77
<properties>
88
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
99
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
10-
<java.version>1.8</java.version>
10+
<maven.compiler.source>1.8</maven.compiler.source>
11+
<maven.compiler.target>1.8</maven.compiler.target>
1112
<spring-version>4.0.0.RELEASE</spring-version>
1213
</properties>
1314
<artifactId>webmagic-parent</artifactId>
@@ -33,7 +34,7 @@
3334
<connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
3435
<developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
3536
<url>git@github.com:code4craft/webmagic.git</url>
36-
<tag>webmagic-parent-0.6.1</tag>
37+
<tag>WebMagic-${project.version}</tag>
3738
</scm>
3839
<licenses>
3940
<license>
@@ -49,6 +50,7 @@
4950
<module>webmagic-selenium</module>
5051
<module>webmagic-saxon</module>
5152
<module>webmagic-samples</module>
53+
<module>webmagic-coverage</module>
5254
</modules>
5355

5456
<dependencyManagement>
@@ -73,17 +75,17 @@
7375
<dependency>
7476
<groupId>org.apache.httpcomponents</groupId>
7577
<artifactId>httpcore</artifactId>
76-
<version>4.4.13</version>
78+
<version>4.4.14</version>
7779
</dependency>
7880
<dependency>
7981
<groupId>com.google.guava</groupId>
8082
<artifactId>guava</artifactId>
81-
<version>30.0-android</version>
83+
<version>30.1-jre</version>
8284
</dependency>
8385
<dependency>
8486
<groupId>com.jayway.jsonpath</groupId>
8587
<artifactId>json-path</artifactId>
86-
<version>2.6.0</version>
88+
<version>2.5.0</version>
8789
</dependency>
8890
<dependency>
8991
<groupId>org.slf4j</groupId>
@@ -98,12 +100,12 @@
98100
<dependency>
99101
<groupId>us.codecraft</groupId>
100102
<artifactId>xsoup</artifactId>
101-
<version>0.3.1</version>
103+
<version>0.3.2</version>
102104
</dependency>
103105
<dependency>
104106
<groupId>com.alibaba</groupId>
105107
<artifactId>fastjson</artifactId>
106-
<version>1.2.69</version>
108+
<version>1.2.75</version>
107109
</dependency>
108110
<dependency>
109111
<groupId>com.github.dreamhead</groupId>
@@ -125,38 +127,33 @@
125127
<dependency>
126128
<groupId>org.assertj</groupId>
127129
<artifactId>assertj-core</artifactId>
128-
<version>3.16.1</version>
130+
<version>3.18.1</version>
129131
<scope>test</scope>
130132
</dependency>
131133
<dependency>
132134
<groupId>org.apache.commons</groupId>
133135
<artifactId>commons-lang3</artifactId>
134-
<version>3.10</version>
136+
<version>3.11</version>
135137
</dependency>
136138
<dependency>
137139
<groupId>commons-collections</groupId>
138140
<artifactId>commons-collections</artifactId>
139141
<version>3.2.2</version>
140142
</dependency>
141143
<dependency>
142-
<groupId>commons-io</groupId>
143-
<artifactId>commons-io</artifactId>
144-
<version>2.7</version>
145-
</dependency>
144+
<groupId>commons-io</groupId>
145+
<artifactId>commons-io</artifactId>
146+
<version>2.8.0</version>
147+
</dependency>
146148
<dependency>
147149
<groupId>org.codehaus.groovy</groupId>
148150
<artifactId>groovy-all</artifactId>
149-
<version>2.4.19</version>
151+
<version>3.0.7</version>
150152
</dependency>
151153
<dependency>
152154
<groupId>org.jruby</groupId>
153155
<artifactId>jruby</artifactId>
154-
<version>9.2.11.1</version>
155-
</dependency>
156-
<dependency>
157-
<groupId>org.jsoup</groupId>
158-
<artifactId>jsoup</artifactId>
159-
<version>1.10.3</version>
156+
<version>9.2.14.0</version>
160157
</dependency>
161158
<dependency>
162159
<groupId>org.python</groupId>
@@ -171,12 +168,12 @@
171168
<dependency>
172169
<groupId>net.sf.saxon</groupId>
173170
<artifactId>Saxon-HE</artifactId>
174-
<version>10.1</version>
171+
<version>10.3</version>
175172
</dependency>
176173
<dependency>
177174
<groupId>net.sourceforge.htmlcleaner</groupId>
178175
<artifactId>htmlcleaner</artifactId>
179-
<version>2.5</version>
176+
<version>2.9</version>
180177
</dependency>
181178
<dependency>
182179
<groupId>com.github.detro</groupId>
@@ -191,7 +188,7 @@
191188
<dependency>
192189
<groupId>redis.clients</groupId>
193190
<artifactId>jedis</artifactId>
194-
<version>2.9.3</version>
191+
<version>3.6.0</version>
195192
</dependency>
196193
</dependencies>
197194
</dependencyManagement>
@@ -211,7 +208,7 @@
211208
<configuration>
212209
<rules>
213210
<requireMavenVersion>
214-
<version>3.0.5</version>
211+
<version>3.3.9</version>
215212
</requireMavenVersion>
216213
</rules>
217214
</configuration>
@@ -221,19 +218,10 @@
221218
<plugin>
222219
<groupId>org.apache.maven.plugins</groupId>
223220
<artifactId>maven-surefire-plugin</artifactId>
224-
<version>3.0.0-M4</version>
225-
<configuration>
226-
<forkCount>0</forkCount>
227-
</configuration>
228221
</plugin>
229222
<plugin>
230223
<groupId>org.apache.maven.plugins</groupId>
231224
<artifactId>maven-compiler-plugin</artifactId>
232-
<version>3.8.1</version>
233-
<configuration>
234-
<source>${java.version}</source>
235-
<target>${java.version}</target>
236-
</configuration>
237225
</plugin>
238226
<!--<plugin>-->
239227
<!--<groupId>org.apache.maven.plugins</groupId>-->
@@ -258,12 +246,10 @@
258246
<plugin>
259247
<groupId>org.apache.maven.plugins</groupId>
260248
<artifactId>maven-resources-plugin</artifactId>
261-
<version>3.1.0</version>
262249
</plugin>
263250
<plugin>
264251
<groupId>org.apache.maven.plugins</groupId>
265252
<artifactId>maven-jar-plugin</artifactId>
266-
<version>3.2.0</version>
267253
<configuration>
268254
<excludes>
269255
<exclude>log4j.xml</exclude>
@@ -289,7 +275,7 @@
289275
<version>3.2.0</version>
290276
<configuration>
291277
<encoding>UTF-8</encoding>
292-
<doctitle>WebMagic 0.7.4</doctitle>
278+
<doctitle>WebMagic ${project.version}</doctitle>
293279
<locale>en_US</locale>
294280

295281
<!-- avoid the issue: https://bugs.openjdk.java.net/browse/JDK-8212233 -->
@@ -317,9 +303,147 @@
317303
<artifactId>maven-release-plugin</artifactId>
318304
<version>3.0.0-M1</version>
319305
</plugin>
306+
<plugin>
307+
<groupId>org.jacoco</groupId>
308+
<artifactId>jacoco-maven-plugin</artifactId>
309+
<executions>
310+
<execution>
311+
<goals>
312+
<goal>prepare-agent</goal>
313+
</goals>
314+
</execution>
315+
<execution>
316+
<id>report</id>
317+
<phase>verify</phase>
318+
<goals>
319+
<goal>report</goal>
320+
</goals>
321+
</execution>
322+
</executions>
323+
</plugin>
324+
<plugin>
325+
<groupId>com.amashchenko.maven.plugin</groupId>
326+
<artifactId>gitflow-maven-plugin</artifactId>
327+
<configuration>
328+
<gitFlowConfig>
329+
<versionTagPrefix>WebMagic-</versionTagPrefix>
330+
</gitFlowConfig>
331+
</configuration>
332+
</plugin>
320333
</plugins>
334+
<pluginManagement>
335+
<plugins>
336+
<plugin>
337+
<groupId>org.apache.maven.plugins</groupId>
338+
<artifactId>maven-clean-plugin</artifactId>
339+
<version>3.1.0</version>
340+
</plugin>
341+
<plugin>
342+
<groupId>org.apache.maven.plugins</groupId>
343+
<artifactId>maven-compiler-plugin</artifactId>
344+
<version>3.8.1</version>
345+
</plugin>
346+
<plugin>
347+
<groupId>org.apache.maven.plugins</groupId>
348+
<artifactId>maven-deploy-plugin</artifactId>
349+
<version>3.0.0-M1</version>
350+
</plugin>
351+
<plugin>
352+
<groupId>org.apache.maven.plugins</groupId>
353+
<artifactId>maven-install-plugin</artifactId>
354+
<version>3.0.0-M1</version>
355+
</plugin>
356+
<plugin>
357+
<groupId>org.apache.maven.plugins</groupId>
358+
<artifactId>maven-jar-plugin</artifactId>
359+
<version>3.2.0</version>
360+
</plugin>
361+
<plugin>
362+
<groupId>org.apache.maven.plugins</groupId>
363+
<artifactId>maven-jxr-plugin</artifactId>
364+
<version>3.1.1</version>
365+
</plugin>
366+
<plugin>
367+
<groupId>org.apache.maven.plugins</groupId>
368+
<artifactId>maven-pmd-plugin</artifactId>
369+
<version>3.14.0</version>
370+
</plugin>
371+
<plugin>
372+
<groupId>org.apache.maven.plugins</groupId>
373+
<artifactId>maven-resources-plugin</artifactId>
374+
<version>3.2.0</version>
375+
</plugin>
376+
<plugin>
377+
<groupId>org.apache.maven.plugins</groupId>
378+
<artifactId>maven-site-plugin</artifactId>
379+
<version>3.9.1</version>
380+
</plugin>
381+
<plugin>
382+
<groupId>org.apache.maven.plugins</groupId>
383+
<artifactId>maven-surefire-plugin</artifactId>
384+
<version>3.0.0-M5</version>
385+
</plugin>
386+
<plugin>
387+
<groupId>org.apache.maven.plugins</groupId>
388+
<artifactId>maven-surefire-report-plugin</artifactId>
389+
<version>3.0.0-M5</version>
390+
</plugin>
391+
<plugin>
392+
<groupId>org.codehaus.mojo</groupId>
393+
<artifactId>taglist-maven-plugin</artifactId>
394+
<version>2.4</version>
395+
</plugin>
396+
<plugin>
397+
<groupId>org.jacoco</groupId>
398+
<artifactId>jacoco-maven-plugin</artifactId>
399+
<version>0.8.7</version>
400+
</plugin>
401+
<plugin>
402+
<groupId>com.amashchenko.maven.plugin</groupId>
403+
<artifactId>gitflow-maven-plugin</artifactId>
404+
<version>1.15.0</version>
405+
</plugin>
406+
<plugin>
407+
<groupId>com.github.spotbugs</groupId>
408+
<artifactId>spotbugs-maven-plugin</artifactId>
409+
<version>4.2.3</version>
410+
</plugin>
411+
</plugins>
412+
</pluginManagement>
321413
</build>
322414

415+
<reporting>
416+
<plugins>
417+
<plugin>
418+
<groupId>org.apache.maven.plugins</groupId>
419+
<artifactId>maven-javadoc-plugin</artifactId>
420+
<configuration>
421+
<doclint>none</doclint>
422+
</configuration>
423+
</plugin>
424+
<plugin>
425+
<groupId>org.apache.maven.plugins</groupId>
426+
<artifactId>maven-jxr-plugin</artifactId>
427+
</plugin>
428+
<plugin>
429+
<groupId>org.apache.maven.plugins</groupId>
430+
<artifactId>maven-pmd-plugin</artifactId>
431+
</plugin>
432+
<plugin>
433+
<groupId>org.apache.maven.plugins</groupId>
434+
<artifactId>maven-surefire-report-plugin</artifactId>
435+
</plugin>
436+
<plugin>
437+
<groupId>org.codehaus.mojo</groupId>
438+
<artifactId>taglist-maven-plugin</artifactId>
439+
</plugin>
440+
<plugin>
441+
<groupId>com.github.spotbugs</groupId>
442+
<artifactId>spotbugs-maven-plugin</artifactId>
443+
</plugin>
444+
</plugins>
445+
</reporting>
446+
323447
<profiles>
324448
<profile>
325449
<id>release</id>

‎src/site/site.xml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<project xmlns="http://maven.apache.org/DECORATION/1.6.0"
2+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3+
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.6.0
4+
http://maven.apache.org/xsd/decoration-1.6.0.xsd">
5+
<skin>
6+
<groupId>org.apache.maven.skins</groupId>
7+
<artifactId>maven-fluido-skin</artifactId>
8+
<version>1.9</version>
9+
</skin>
10+
<body>
11+
<menu ref="parent" inherit="top" />
12+
<menu ref="modules" inherit="top" />
13+
<menu ref="reports" inherit="top" />
14+
</body>
15+
<custom>
16+
<fluidoSkin>
17+
<topBarEnabled>true</topBarEnabled>
18+
<sideBarEnabled>true</sideBarEnabled>
19+
<sourceLineNumbersEnabled>true</sourceLineNumbersEnabled>
20+
<copyrightClass>pull-right</copyrightClass>
21+
</fluidoSkin>
22+
</custom>
23+
</project>

‎webmagic-core/pom.xml

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<groupId>us.codecraft</groupId>
55
<artifactId>webmagic-parent</artifactId>
6-
<version>0.7.4</version>
6+
<version>0.7.5</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

@@ -61,11 +61,6 @@
6161
<artifactId>assertj-core</artifactId>
6262
</dependency>
6363

64-
<dependency>
65-
<groupId>org.jsoup</groupId>
66-
<artifactId>jsoup</artifactId>
67-
</dependency>
68-
6964
<dependency>
7065
<groupId>commons-io</groupId>
7166
<artifactId>commons-io</artifactId>

‎webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,8 @@ public Spider setScheduler(Scheduler scheduler) {
208208
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
209209
* @deprecated
210210
*/
211-
public Spider pipeline(Pipeline pipeline) {
211+
@Deprecated
212+
public Spider pipeline(Pipeline pipeline) {
212213
return addPipeline(pipeline);
213214
}
214215

@@ -258,7 +259,8 @@ public Spider clearPipeline() {
258259
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
259260
* @deprecated
260261
*/
261-
public Spider downloader(Downloader downloader) {
262+
@Deprecated
263+
public Spider downloader(Downloader downloader) {
262264
return setDownloader(downloader);
263265
}
264266

@@ -320,7 +322,7 @@ public void run() {
320322
processRequest(request);
321323
onSuccess(request);
322324
} catch (Exception e) {
323-
onError(request);
325+
onError(request, e);
324326
logger.error("process request " + request + " error", e);
325327
} finally {
326328
pageCount.incrementAndGet();
@@ -338,10 +340,19 @@ public void run() {
338340
logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
339341
}
340342

343+
/**
344+
* @deprecated Use {@link #onError(Request, Exception)} instead.
345+
*/
346+
@Deprecated
341347
protected void onError(Request request) {
348+
}
349+
350+
protected void onError(Request request, Exception e) {
351+
this.onError(request);
352+
342353
if (CollectionUtils.isNotEmpty(spiderListeners)) {
343354
for (SpiderListener spiderListener : spiderListeners) {
344-
spiderListener.onError(request);
355+
spiderListener.onError(request, e);
345356
}
346357
}
347358
}

‎webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,14 @@ public interface SpiderListener {
1010

1111
public void onSuccess(Request request);
1212

13+
/**
14+
* @deprecated Use {@link #onError(Request, Exception)} instead.
15+
*/
16+
@Deprecated
1317
public void onError(Request request);
18+
19+
default void onError(Request request, Exception e) {
20+
this.onError(request);
21+
}
22+
1423
}

‎webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
package us.codecraft.webmagic.selector;
22

3-
import org.jsoup.helper.StringUtil;
4-
import org.jsoup.nodes.Element;
5-
import org.jsoup.select.Elements;
6-
73
import java.util.ArrayList;
84
import java.util.List;
95

6+
import org.apache.commons.lang3.StringUtils;
7+
import org.jsoup.nodes.Element;
8+
import org.jsoup.select.Elements;
9+
1010
/**
1111
* Links selector based on jsoup. Use absolute url. <br>
1212
*
@@ -23,9 +23,9 @@ public String select(Element element) {
2323
@Override
2424
public List<String> selectList(Element element) {
2525
Elements elements = element.select("a");
26-
List<String> links = new ArrayList<String>(elements.size());
26+
List<String> links = new ArrayList<>(elements.size());
2727
for (Element element0 : elements) {
28-
if (!StringUtil.isBlank(element0.baseUri())) {
28+
if (StringUtils.isNotBlank(element0.baseUri())) {
2929
links.add(element0.attr("abs:href"));
3030
} else {
3131
links.add(element0.attr("href"));

‎webmagic-coverage/pom.xml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0"
3+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
5+
http://maven.apache.org/maven-v4_0_0.xsd">
6+
<modelVersion>4.0.0</modelVersion>
7+
8+
<parent>
9+
<groupId>us.codecraft</groupId>
10+
<artifactId>webmagic-parent</artifactId>
11+
<version>0.7.5</version>
12+
</parent>
13+
14+
<artifactId>webmagic-coverage</artifactId>
15+
<packaging>pom</packaging>
16+
<name>webmagic-coverage</name>
17+
<description>Compute aggregated test code coverage</description>
18+
19+
<properties>
20+
<maven.deploy.skip>true</maven.deploy.skip>
21+
</properties>
22+
23+
<dependencies>
24+
<dependency>
25+
<groupId>${project.groupId}</groupId>
26+
<artifactId>webmagic-core</artifactId>
27+
<version>${project.version}</version>
28+
</dependency>
29+
<dependency>
30+
<groupId>${project.groupId}</groupId>
31+
<artifactId>webmagic-extension</artifactId>
32+
<version>${project.version}</version>
33+
</dependency>
34+
<dependency>
35+
<groupId>${project.groupId}</groupId>
36+
<artifactId>webmagic-scripts</artifactId>
37+
<version>${project.version}</version>
38+
</dependency>
39+
<dependency>
40+
<groupId>${project.groupId}</groupId>
41+
<artifactId>webmagic-selenium</artifactId>
42+
<version>${project.version}</version>
43+
</dependency>
44+
<dependency>
45+
<groupId>${project.groupId}</groupId>
46+
<artifactId>webmagic-saxon</artifactId>
47+
<version>${project.version}</version>
48+
</dependency>
49+
<dependency>
50+
<groupId>${project.groupId}</groupId>
51+
<artifactId>webmagic-samples</artifactId>
52+
<version>${project.version}</version>
53+
</dependency>
54+
</dependencies>
55+
56+
<reporting>
57+
<plugins>
58+
<plugin>
59+
<groupId>org.jacoco</groupId>
60+
<artifactId>jacoco-maven-plugin</artifactId>
61+
<reportSets>
62+
<reportSet>
63+
<reports>
64+
<report>report-aggregate</report>
65+
</reports>
66+
</reportSet>
67+
</reportSets>
68+
</plugin>
69+
</plugins>
70+
</reporting>
71+
72+
</project>

‎webmagic-extension/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<groupId>us.codecraft</groupId>
55
<artifactId>webmagic-parent</artifactId>
6-
<version>0.7.4</version>
6+
<version>0.7.5</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

‎webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderLi
6868
return new SpiderStatus(spider, monitorSpiderListener);
6969
}
7070

71+
protected List<SpiderStatusMXBean> getSpiderStatuses() {
72+
return this.spiderStatuses;
73+
}
74+
7175
public static SpiderMonitor instance() {
7276
return INSTANCE;
7377
}

‎webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,13 @@ public Date getStartTime() {
8484

8585
@Override
8686
public int getPagePerSecond() {
87-
int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
88-
return getSuccessPageCount() / runSeconds;
87+
if (getStartTime() != null) {
88+
int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
89+
if (runSeconds != 0) {
90+
return getSuccessPageCount() / runSeconds;
91+
}
92+
}
93+
return -1;
8994
}
9095

9196
}
‎webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java

Lines changed: 33 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,23 @@
11
package us.codecraft.webmagic.scheduler;
22

3-
import com.alibaba.fastjson.JSON;
3+
import java.util.Set;
4+
45
import org.apache.commons.codec.digest.DigestUtils;
56
import org.apache.commons.lang3.StringUtils;
7+
8+
import com.alibaba.fastjson.JSON;
9+
610
import redis.clients.jedis.Jedis;
711
import redis.clients.jedis.JedisPool;
812
import us.codecraft.webmagic.Request;
913
import us.codecraft.webmagic.Task;
1014

11-
import java.util.Set;
12-
1315
/**
1416
* the redis scheduler with priority
1517
* @author sai
1618
* Created by sai on 16-5-27.
1719
*/
18-
public class RedisPriorityScheduler extends RedisScheduler
19-
{
20+
public class RedisPriorityScheduler extends RedisScheduler {
2021

2122
private static final String ZSET_PREFIX = "zset_";
2223

@@ -37,114 +38,84 @@ public RedisPriorityScheduler(JedisPool pool) {
3738
}
3839

3940
@Override
40-
protected void pushWhenNoDuplicate(Request request, Task task)
41-
{
42-
Jedis jedis = pool.getResource();
43-
try
44-
{
45-
if(request.getPriority() > 0)
41+
protected void pushWhenNoDuplicate(Request request, Task task) {
42+
try (Jedis jedis = pool.getResource()) {
43+
if (request.getPriority() > 0) {
4644
jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
47-
else if(request.getPriority() < 0)
45+
} else if (request.getPriority() < 0) {
4846
jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
49-
else
47+
} else {
5048
jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
49+
}
5150

5251
setExtrasInItem(jedis, request, task);
5352
}
54-
finally
55-
{
56-
pool.returnResource(jedis);
57-
}
5853
}
5954

6055
@Override
61-
public synchronized Request poll(Task task)
62-
{
63-
Jedis jedis = pool.getResource();
64-
try
65-
{
56+
public synchronized Request poll(Task task) {
57+
try (Jedis jedis = pool.getResource()) {
6658
String url = getRequest(jedis, task);
67-
if(StringUtils.isBlank(url))
59+
if (StringUtils.isBlank(url)) {
6860
return null;
61+
}
6962
return getExtrasInItem(jedis, url, task);
7063
}
71-
finally
72-
{
73-
pool.returnResource(jedis);
74-
}
7564
}
7665

77-
private String getRequest(Jedis jedis, Task task)
78-
{
66+
private String getRequest(Jedis jedis, Task task) {
7967
String url;
8068
Set<String> urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
81-
if(urls.isEmpty())
82-
{
69+
if (urls.isEmpty()) {
8370
url = jedis.lpop(getQueueNoPriorityKey(task));
84-
if(StringUtils.isBlank(url))
85-
{
71+
if (StringUtils.isBlank(url)) {
8672
urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
87-
if(!urls.isEmpty())
88-
{
73+
if (!urls.isEmpty()) {
8974
url = urls.toArray(new String[0])[0];
9075
jedis.zrem(getZsetMinusPriorityKey(task), url);
9176
}
9277
}
93-
}
94-
else
95-
{
78+
} else {
9679
url = urls.toArray(new String[0])[0];
9780
jedis.zrem(getZsetPlusPriorityKey(task), url);
9881
}
9982
return url;
10083
}
10184

10285
@Override
103-
public void resetDuplicateCheck(Task task)
104-
{
105-
Jedis jedis = pool.getResource();
106-
try
107-
{
86+
public void resetDuplicateCheck(Task task) {
87+
try (Jedis jedis = pool.getResource()) {
10888
jedis.del(getSetKey(task));
10989
}
110-
finally
111-
{
112-
pool.returnResource(jedis);
113-
}
11490
}
11591

116-
private String getZsetPlusPriorityKey(Task task)
117-
{
92+
private String getZsetPlusPriorityKey(Task task) {
11893
return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
11994
}
12095

121-
private String getQueueNoPriorityKey(Task task)
122-
{
96+
private String getQueueNoPriorityKey(Task task) {
12397
return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
12498
}
12599

126-
private String getZsetMinusPriorityKey(Task task)
127-
{
100+
private String getZsetMinusPriorityKey(Task task) {
128101
return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
129102
}
130103

131-
private void setExtrasInItem(Jedis jedis,Request request, Task task)
132-
{
133-
if(request.getExtras() != null)
134-
{
135-
String field = DigestUtils.shaHex(request.getUrl());
104+
private void setExtrasInItem(Jedis jedis,Request request, Task task) {
105+
if (request.getExtras() != null) {
106+
String field = DigestUtils.sha1Hex(request.getUrl());
136107
String value = JSON.toJSONString(request);
137108
jedis.hset(getItemKey(task), field, value);
138109
}
139110
}
140111

141-
private Request getExtrasInItem(Jedis jedis, String url, Task task)
142-
{
112+
private Request getExtrasInItem(Jedis jedis, String url, Task task) {
143113
String key = getItemKey(task);
144-
String field = DigestUtils.shaHex(url);
114+
String field = DigestUtils.sha1Hex(url);
145115
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
146-
if(bytes != null)
116+
if (bytes != null) {
147117
return JSON.parseObject(new String(bytes), Request.class);
118+
}
148119
return new Request(url);
149120
}
150121
}

‎webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package us.codecraft.webmagic.scheduler;
22

3-
import com.alibaba.fastjson.JSON;
43
import org.apache.commons.codec.digest.DigestUtils;
54
import org.apache.commons.lang3.StringUtils;
5+
6+
import com.alibaba.fastjson.JSON;
7+
68
import redis.clients.jedis.Jedis;
79
import redis.clients.jedis.JedisPool;
810
import redis.clients.jedis.JedisPoolConfig;
@@ -37,21 +39,15 @@ public RedisScheduler(JedisPool pool) {
3739

3840
@Override
3941
public void resetDuplicateCheck(Task task) {
40-
Jedis jedis = pool.getResource();
41-
try {
42+
try (Jedis jedis = pool.getResource()) {
4243
jedis.del(getSetKey(task));
43-
} finally {
44-
pool.returnResource(jedis);
4544
}
4645
}
4746

4847
@Override
4948
public boolean isDuplicate(Request request, Task task) {
50-
Jedis jedis = pool.getResource();
51-
try {
49+
try (Jedis jedis = pool.getResource()) {
5250
return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
53-
} finally {
54-
pool.returnResource(jedis);
5551
}
5652

5753
}
@@ -62,7 +58,7 @@ protected void pushWhenNoDuplicate(Request request, Task task) {
6258
try {
6359
jedis.rpush(getQueueKey(task), request.getUrl());
6460
if (checkForAdditionalInfo(request)) {
65-
String field = DigestUtils.shaHex(request.getUrl());
61+
String field = DigestUtils.sha1Hex(request.getUrl());
6662
String value = JSON.toJSONString(request);
6763
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
6864
}
@@ -100,23 +96,20 @@ private boolean checkForAdditionalInfo(Request request) {
10096

10197
@Override
10298
public synchronized Request poll(Task task) {
103-
Jedis jedis = pool.getResource();
104-
try {
99+
try (Jedis jedis = pool.getResource()) {
105100
String url = jedis.lpop(getQueueKey(task));
106101
if (url == null) {
107102
return null;
108103
}
109104
String key = ITEM_PREFIX + task.getUUID();
110-
String field = DigestUtils.shaHex(url);
105+
String field = DigestUtils.sha1Hex(url);
111106
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
112107
if (bytes != null) {
113108
Request o = JSON.parseObject(new String(bytes), Request.class);
114109
return o;
115110
}
116111
Request request = new Request(url);
117112
return request;
118-
} finally {
119-
pool.returnResource(jedis);
120113
}
121114
}
122115

@@ -134,23 +127,17 @@ protected String getItemKey(Task task) {
134127

135128
@Override
136129
public int getLeftRequestsCount(Task task) {
137-
Jedis jedis = pool.getResource();
138-
try {
130+
try (Jedis jedis = pool.getResource()) {
139131
Long size = jedis.llen(getQueueKey(task));
140132
return size.intValue();
141-
} finally {
142-
pool.returnResource(jedis);
143133
}
144134
}
145135

146136
@Override
147137
public int getTotalRequestsCount(Task task) {
148-
Jedis jedis = pool.getResource();
149-
try {
138+
try (Jedis jedis = pool.getResource()) {
150139
Long size = jedis.scard(getSetKey(task));
151140
return size.intValue();
152-
} finally {
153-
pool.returnResource(jedis);
154141
}
155142
}
156143
}

‎webmagic-samples/pom.xml

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>webmagic-parent</artifactId>
55
<groupId>us.codecraft</groupId>
6-
<version>0.7.4</version>
6+
<version>0.7.5</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

@@ -24,6 +24,26 @@
2424
<groupId>junit</groupId>
2525
<artifactId>junit</artifactId>
2626
</dependency>
27+
<dependency>
28+
<groupId>org.mapdb</groupId>
29+
<artifactId>mapdb</artifactId>
30+
<version>3.0.8</version>
31+
</dependency>
32+
<dependency>
33+
<groupId>com.fasterxml.jackson.core</groupId>
34+
<artifactId>jackson-core</artifactId>
35+
<version>2.13.0-rc1</version>
36+
</dependency>
37+
<dependency>
38+
<groupId>com.fasterxml.jackson.core</groupId>
39+
<artifactId>jackson-annotations</artifactId>
40+
<version>2.13.0-rc1</version>
41+
</dependency>
42+
<dependency>
43+
<groupId>com.fasterxml.jackson.core</groupId>
44+
<artifactId>jackson-databind</artifactId>
45+
<version>2.13.0-rc1</version>
46+
</dependency>
2747
</dependencies>
2848

2949
</project>
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package us.codecraft.webmagic.recover;
2+
3+
import com.google.common.base.Charsets;
4+
import com.google.common.hash.BloomFilter;
5+
import com.google.common.hash.Funnels;
6+
import org.mapdb.DB;
7+
import org.mapdb.DBMaker;
8+
import org.mapdb.IndexTreeList;
9+
import org.mapdb.Serializer;
10+
import us.codecraft.webmagic.Request;
11+
import us.codecraft.webmagic.Task;
12+
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
13+
14+
import java.util.concurrent.atomic.AtomicInteger;
15+
16+
/**
17+
* @author :linweisen
18+
*/
19+
public class DuplicateStorageRemover implements DuplicateRemover {
20+
21+
private DB db;
22+
23+
private static String DATABASE_NAME = "duplicate";
24+
25+
private IndexTreeList<String> urlDuplicateQueue;
26+
27+
private BloomFilter<CharSequence> bloomFilter;
28+
29+
private AtomicInteger counter;
30+
31+
public DuplicateStorageRemover(String path) {
32+
33+
String duplicatStoragePath = path;
34+
35+
DB db = DBMaker.fileDB(duplicatStoragePath)
36+
.fileMmapEnableIfSupported()
37+
.fileMmapPreclearDisable()
38+
.cleanerHackEnable()
39+
.closeOnJvmShutdown()
40+
.transactionEnable()
41+
.concurrencyScale(128)
42+
.make();
43+
this.db = db;
44+
45+
this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
46+
47+
counter = new AtomicInteger(this.urlDuplicateQueue.size());
48+
this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7);
49+
for (String url : this.urlDuplicateQueue){
50+
bloomFilter.put(url);
51+
}
52+
53+
}
54+
55+
@Override
56+
public boolean isDuplicate(Request request, Task task) {
57+
String url = request.getUrl();
58+
boolean isDuplicate = bloomFilter.mightContain(url);
59+
if (!isDuplicate) {
60+
bloomFilter.put(url);
61+
urlDuplicateQueue.add(url);
62+
this.db.commit();
63+
counter.incrementAndGet();
64+
}
65+
return isDuplicate;
66+
}
67+
68+
@Override
69+
public void resetDuplicateCheck(Task task) {
70+
this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7);
71+
this.urlDuplicateQueue.clear();
72+
}
73+
74+
@Override
75+
public int getTotalRequestsCount(Task task) {
76+
return counter.get();
77+
}
78+
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
package us.codecraft.webmagic.recover;
2+
3+
import com.fasterxml.jackson.databind.ObjectMapper;
4+
import org.apache.commons.lang3.StringUtils;
5+
import org.mapdb.DB;
6+
import org.mapdb.DBMaker;
7+
import org.mapdb.IndexTreeList;
8+
import org.mapdb.Serializer;
9+
import us.codecraft.webmagic.Request;
10+
import us.codecraft.webmagic.Task;
11+
import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
12+
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
13+
14+
import java.io.IOException;
15+
16+
/**
17+
* @author :linweisen
18+
*/
19+
public class MmapQueueScheduler extends DuplicateRemovedScheduler {
20+
21+
private DB db;
22+
23+
private static String DATABASE_NAME = "queue";
24+
25+
private IndexTreeList<String> queue;
26+
27+
private static ObjectMapper mapper;
28+
29+
public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) {
30+
super.setDuplicateRemover(duplicateRemover);
31+
32+
String queuePath = path;
33+
34+
DB db = DBMaker.fileDB(queuePath)
35+
.fileMmapEnableIfSupported()
36+
.fileMmapPreclearDisable()
37+
.cleanerHackEnable()
38+
.closeOnJvmShutdown()
39+
.transactionEnable()
40+
.concurrencyScale(128)
41+
.make();
42+
this.db = db;
43+
this.mapper = new ObjectMapper();
44+
this.queue = db.indexTreeList(MmapQueueScheduler.DATABASE_NAME, Serializer.STRING).createOrOpen();
45+
}
46+
47+
@Override
48+
public Request poll(Task task) {
49+
if (this.queue.size() > 0){
50+
String s = queue.remove(0);
51+
return fromJson(s, Request.class);
52+
}else{
53+
return null;
54+
}
55+
56+
}
57+
58+
@Override
59+
public void pushWhenNoDuplicate(Request request, Task task) {
60+
queue.add(toJson(request));
61+
this.db.commit();
62+
}
63+
64+
public String toJson(Object object) {
65+
try {
66+
return mapper.writeValueAsString(object);
67+
} catch (IOException e) {
68+
logger.warn("write to json string error:" + object, e);
69+
return null;
70+
}
71+
}
72+
73+
public <T> T fromJson(String jsonString, Class<T> clazz) {
74+
if (StringUtils.isEmpty(jsonString)) {
75+
return null;
76+
}
77+
try {
78+
return mapper.readValue(jsonString, clazz);
79+
} catch (IOException e) {
80+
logger.warn("parse json string error:" + jsonString, e);
81+
return null;
82+
}
83+
}
84+
85+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package us.codecraft.webmagic.recover;
2+
3+
4+
import us.codecraft.webmagic.Spider;
5+
import us.codecraft.webmagic.samples.SinaBlogProcessor;
6+
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
7+
8+
/**
9+
* @author code4crafter@gmail.com <br>
10+
*/
11+
public class RecoverSample {
12+
13+
public static void main(String[] args) {
14+
String storage = "queue";
15+
String duplicate = "duplicate";
16+
Spider spider = new Spider(new SinaBlogProcessor());
17+
DuplicateRemover remover = new DuplicateStorageRemover(duplicate);
18+
spider.setScheduler(new MmapQueueScheduler(remover, storage));
19+
spider.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html")
20+
.run();
21+
}
22+
}

‎webmagic-saxon/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>webmagic-parent</artifactId>
55
<groupId>us.codecraft</groupId>
6-
<version>0.7.4</version>
6+
<version>0.7.5</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

‎webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,11 @@
11
package us.codecraft.webmagic.selector;
22

3-
import net.sf.saxon.lib.NamespaceConstant;
4-
import net.sf.saxon.xpath.XPathEvaluator;
5-
import org.htmlcleaner.CleanerProperties;
6-
import org.htmlcleaner.DomSerializer;
7-
import org.htmlcleaner.HtmlCleaner;
8-
import org.htmlcleaner.TagNode;
9-
import org.slf4j.Logger;
10-
import org.slf4j.LoggerFactory;
11-
import org.w3c.dom.Document;
12-
import org.w3c.dom.Node;
13-
import org.w3c.dom.NodeList;
3+
import java.io.StringWriter;
4+
import java.util.ArrayList;
5+
import java.util.Iterator;
6+
import java.util.List;
7+
import java.util.Map;
8+
import java.util.concurrent.ConcurrentHashMap;
149

1510
import javax.xml.namespace.NamespaceContext;
1611
import javax.xml.transform.OutputKeys;
@@ -21,12 +16,19 @@
2116
import javax.xml.xpath.XPathConstants;
2217
import javax.xml.xpath.XPathExpression;
2318
import javax.xml.xpath.XPathExpressionException;
24-
import java.io.StringWriter;
25-
import java.util.ArrayList;
26-
import java.util.Iterator;
27-
import java.util.List;
28-
import java.util.Map;
29-
import java.util.concurrent.ConcurrentHashMap;
19+
20+
import org.htmlcleaner.CleanerProperties;
21+
import org.htmlcleaner.DomSerializer;
22+
import org.htmlcleaner.HtmlCleaner;
23+
import org.htmlcleaner.TagNode;
24+
import org.slf4j.Logger;
25+
import org.slf4j.LoggerFactory;
26+
import org.w3c.dom.Document;
27+
import org.w3c.dom.Node;
28+
import org.w3c.dom.NodeList;
29+
30+
import net.sf.saxon.lib.NamespaceConstant;
31+
import net.sf.saxon.xpath.XPathEvaluator;
3032

3133
/**
3234
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>

‎webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package us.codecraft.webmagic.selector;
22

3+
import java.util.List;
4+
35
import org.htmlcleaner.HtmlCleaner;
46
import org.htmlcleaner.TagNode;
57
import org.htmlcleaner.XPatherException;
@@ -8,6 +10,7 @@
810
import org.junit.Assert;
911
import org.junit.Ignore;
1012
import org.junit.Test;
13+
1114
import us.codecraft.xsoup.XPathEvaluator;
1215
import us.codecraft.xsoup.Xsoup;
1316

@@ -1367,15 +1370,19 @@ public void testOschina() {
13671370
public void testXPath2() {
13681371
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
13691372
"<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>";
1370-
XpathSelector xpathSelector = new XpathSelector("//h1/text()");
1371-
System.out.println(xpathSelector.select(text));
1373+
Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
1374+
Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
13721375
}
13731376

13741377
@Test
13751378
public void testXpath2Selector() {
13761379
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
13771380
String select = xpath2Selector.select(html);
1378-
Assert.assertNotNull(select);
1381+
Assert.assertEquals("http://www.oschina.net/", select);
1382+
1383+
List<String> selectList = xpath2Selector.selectList(html);
1384+
Assert.assertEquals(113, selectList.size());
1385+
Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
13791386
}
13801387

13811388
@Ignore("take long time")

‎webmagic-scripts/pom.xml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>webmagic-parent</artifactId>
55
<groupId>us.codecraft</groupId>
6-
<version>0.7.4</version>
6+
<version>0.7.5</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

@@ -22,10 +22,6 @@
2222
<artifactId>kotlin-stdlib</artifactId>
2323
<version>${kotlin.version}</version>
2424
</dependency>
25-
<dependency>
26-
<groupId>org.codehaus.groovy</groupId>
27-
<artifactId>groovy-all</artifactId>
28-
</dependency>
2925
<dependency>
3026
<groupId>org.python</groupId>
3127
<artifactId>jython</artifactId>

‎webmagic-selenium/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>webmagic-parent</artifactId>
55
<groupId>us.codecraft</groupId>
6-
<version>0.7.4</version>
6+
<version>0.7.5</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

0 commit comments

Comments
 (0)
Please sign in to comment.